anthropic prompt caching cost tracking (#5453)

* fix(utils.py): support 'drop_params' for embedding requests

Fixes https://github.com/BerriAI/litellm/issues/5444

* feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic

* feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out

* fix: fix linting errors

* test: mark flaky test
Krish Dholakia 2024-08-31 14:09:35 -07:00 committed by Ishaan Jaff
parent e6faaba56e
commit 65a9c933ad
17 changed files with 432 additions and 84 deletions
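In short: anthropic responses can now report cache_creation_input_tokens and cache_read_input_tokens in their usage block, and LiteLLM prices those tokens with the new cache_creation_input_token_cost / cache_read_input_token_cost entries added to the model cost map below. A condensed, hedged sketch of the flow the new cost test exercises (token counts are illustrative, not from a real call):

    # Sketch of the cache-aware cost flow added in this commit (mirrors the new test).
    from litellm import completion_cost
    from litellm.utils import Choices, Message, ModelResponse, Usage

    response = ModelResponse(
        choices=[
            Choices(
                finish_reason="stop",
                index=0,
                message=Message(content="Hello!", role="assistant"),
            )
        ],
        model="claude-3-5-sonnet-20240620",
        usage=Usage(
            prompt_tokens=14,
            completion_tokens=10,
            total_tokens=24,
            cache_creation_input_tokens=100,  # tokens written to the prompt cache
            cache_read_input_tokens=0,        # tokens served from the prompt cache
        ),
    )

    cost = completion_cost(
        model="anthropic/claude-3-5-sonnet-20240620", completion_response=response
    )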


@@ -841,10 +841,10 @@ ALL_LITELLM_RESPONSE_TYPES = [
 from .types.utils import ImageObject
 from .llms.custom_llm import CustomLLM
 from .llms.huggingface_restapi import HuggingfaceConfig
-from .llms.anthropic import AnthropicConfig
+from .llms.anthropic.chat import AnthropicConfig
+from .llms.anthropic.completion import AnthropicTextConfig
 from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
 from .llms.predibase import PredibaseConfig
-from .llms.anthropic_text import AnthropicTextConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere.completion import CohereConfig
 from .llms.clarifai import ClarifaiConfig


@@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
     cost_router as google_cost_router,
 )
 from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
+from litellm.llms.anthropic.cost_calculation import (
+    cost_per_token as anthropic_cost_per_token,
+)
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
+from litellm.types.utils import Usage
 from litellm.utils import (
     CallTypes,
     CostPerToken,
@@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper(
 def cost_per_token(
     model: str = "",
-    prompt_tokens: float = 0,
-    completion_tokens: float = 0,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
     response_time_ms=None,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
-    prompt_characters: float = 0,
-    completion_characters: float = 0,
+    prompt_characters: int = 0,
+    completion_characters: int = 0,
+    ### PROMPT CACHING PRICING ### - used for anthropic
+    cache_creation_input_tokens: Optional[int] = 0,
+    cache_read_input_tokens: Optional[int] = 0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -108,6 +115,16 @@ def cost_per_token(
     """
     if model is None:
         raise Exception("Invalid arg. Model cannot be none.")
+
+    ## RECONSTRUCT USAGE BLOCK ##
+    usage_block = Usage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
+    )
+
     ## CUSTOM PRICING ##
     response_cost = _cost_per_token_custom_pricing_helper(
         prompt_tokens=prompt_tokens,
@@ -137,6 +154,7 @@ def cost_per_token(
         model_with_provider = model_with_provider_and_region
     else:
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
     model_without_prefix = model
     model_parts = model.split("/")
     if len(model_parts) > 1:
@@ -162,6 +180,7 @@ def cost_per_token(
         # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
         print_verbose(f"Looking up model={model} in model_cost_map")
         if custom_llm_provider == "vertex_ai":
            cost_router = google_cost_router(
                model=model_without_prefix,
@@ -188,6 +207,8 @@ def cost_per_token(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
            )
+        elif custom_llm_provider == "anthropic":
+            return anthropic_cost_per_token(model=model, usage=usage_block)
         elif custom_llm_provider == "gemini":
            return google_cost_per_token(
                model=model_without_prefix,
@@ -520,6 +541,8 @@ def completion_cost(
     prompt_characters = 0
     completion_tokens = 0
     completion_characters = 0
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
     if completion_response is not None and (
         isinstance(completion_response, BaseModel)
         or isinstance(completion_response, dict)
@@ -541,6 +564,13 @@ def completion_cost(
            completion_tokens = completion_response.get("usage", {}).get(
                "completion_tokens", 0
            )
+            cache_creation_input_tokens = completion_response.get("usage", {}).get(
+                "cache_creation_input_tokens", 0
+            )
+            cache_read_input_tokens = completion_response.get("usage", {}).get(
+                "cache_read_input_tokens", 0
+            )
            total_time = getattr(completion_response, "_response_ms", 0)
            verbose_logger.debug(
                f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
@@ -550,7 +580,7 @@ def completion_cost(
            )
            if hasattr(completion_response, "_hidden_params"):
                custom_llm_provider = completion_response._hidden_params.get(
-                    "custom_llm_provider", custom_llm_provider or ""
+                    "custom_llm_provider", custom_llm_provider or None
                )
                region_name = completion_response._hidden_params.get(
                    "region_name", region_name
@@ -697,6 +727,8 @@ def completion_cost(
            custom_cost_per_token=custom_cost_per_token,
            prompt_characters=prompt_characters,
            completion_characters=completion_characters,
+            cache_creation_input_tokens=cache_creation_input_tokens,
+            cache_read_input_tokens=cache_read_input_tokens,
            call_type=call_type,
        )
        _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar
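For callers working at the token level rather than with a full response object, the updated cost_per_token() signature above accepts the two cache counters directly. A minimal sketch, assuming the model resolves in the local cost map and that cost_per_token is exposed at the package top level as elsewhere in litellm (values are illustrative):

    # Sketch: cache counters are folded into the reconstructed Usage block and
    # routed to the anthropic-specific cost helper.
    from litellm import cost_per_token

    prompt_usd, completion_usd = cost_per_token(
        model="claude-3-5-sonnet-20240620",
        custom_llm_provider="anthropic",
        prompt_tokens=14,
        completion_tokens=10,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )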


@@ -1,3 +1,7 @@
+"""
+Calling + translation logic for anthropic's `/v1/messages` endpoint
+"""
+
 import copy
 import json
 import os
@@ -70,8 +74,8 @@ from litellm.types.llms.openai import (
 from litellm.types.utils import Choices, GenericStreamingChunk
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
 
-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory
 
 
 class AnthropicConstants(Enum):
@@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM):
            )
        except Exception as e:
            verbose_logger.exception(
-                "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
+                "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
                    str(e), messages
                )
            )


@@ -1,3 +1,7 @@
+"""
+Translation logic for anthropic's `/v1/complete` endpoint
+"""
+
 import json
 import os
 import time
@@ -12,8 +16,8 @@ import litellm
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
 
-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory
 
 
 class AnthropicConstants(Enum):


@@ -0,0 +1,42 @@
+"""
+Helper util for handling anthropic-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Tuple
+
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="anthropic")
+
+    ## CALCULATE INPUT COST
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+    if model_info.get("cache_creation_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_creation_input_tokens  # type: ignore
+            * model_info["cache_creation_input_token_cost"]
+        )
+    if model_info.get("cache_read_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"]  # type: ignore
+        )
+
+    ## CALCULATE OUTPUT COST
+    completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
+
+    return prompt_cost, completion_cost
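Plugging in the claude-3-5-sonnet prices added to the cost map in this commit, the helper's formula works out as follows (token counts are illustrative):

    # Worked example of the cache-aware cost, using the per-token USD prices
    # added for claude-3-5-sonnet in this commit.
    input_cost_per_token = 0.000003
    cache_creation_input_token_cost = 0.00000375  # 1.25x the base input price
    cache_read_input_token_cost = 0.0000003       # 0.1x the base input price
    output_cost_per_token = 0.000015

    prompt_tokens = 14
    cache_creation_input_tokens = 100
    cache_read_input_tokens = 0
    completion_tokens = 10

    prompt_cost = (
        prompt_tokens * input_cost_per_token
        + cache_creation_input_tokens * cache_creation_input_token_cost
        + cache_read_input_tokens * cache_read_input_token_cost
    )  # 0.000042 + 0.000375 + 0.0 = 0.000417 USD
    completion_cost = completion_tokens * output_cost_per_token  # 0.00015 USD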


@@ -1,11 +1,14 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
+from typing import Any, Optional, Union
+
+import httpx
+import requests
+
 import litellm
-import httpx, requests
-from typing import Optional, Union
-from litellm.litellm_core_utils.litellm_logging import Logging
 
 
 class BaseLLM:
     _client_session: Optional[httpx.Client] = None
 
     def process_response(
@@ -14,7 +17,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.ModelResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],
@@ -33,7 +36,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.TextCompletionResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],


@@ -267,18 +267,19 @@ def completion(
    ):
        try:
            import vertexai
-            from anthropic import AnthropicVertex
-            from litellm.llms.anthropic import AnthropicChatCompletion
-            from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
-                VertexLLM,
-            )
        except:
            raise VertexAIError(
                status_code=400,
                message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""",
            )
 
+        from anthropic import AnthropicVertex
+        from litellm.llms.anthropic.chat import AnthropicChatCompletion
+        from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+            VertexLLM,
+        )
+
    if not (
        hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
    ):


@@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache
 from .llms import (
     ai21,
     aleph_alpha,
-    anthropic_text,
     baseten,
     bedrock,
     clarifai,
     cloudflare,
-    gemini,
-    huggingface_restapi,
     maritalk,
     nlp_cloud,
     ollama,
@@ -93,13 +90,10 @@ from .llms import (
     palm,
     petals,
     replicate,
-    together_ai,
-    triton,
     vllm,
-    watsonx,
 )
-from .llms.anthropic import AnthropicChatCompletion
-from .llms.anthropic_text import AnthropicTextCompletion
+from .llms.anthropic.chat import AnthropicChatCompletion
+from .llms.anthropic.completion import AnthropicTextCompletion
 from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params
 from .llms.azure_text import AzureTextCompletion
 from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM


@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,


@@ -1,4 +1,4 @@
 model_list:
-  - model_name: "gemini/*"
+  - model_name: "gpt-3.5-turbo"
     litellm_params:
-      model: "gemini/*"
+      model: "gpt-3.5-turbo"


@@ -10,7 +10,7 @@ from dotenv import load_dotenv
 import litellm.types
 import litellm.types.utils
-from litellm.llms.anthropic import ModelResponseIterator
+from litellm.llms.anthropic.chat import ModelResponseIterator
 
 load_dotenv()
 import io
@@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream):
 anthropic_chunk_list = [
-    {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}},
-    {"type": "content_block_delta", "index": 0,
-     "delta": {"type": "text_delta", "text": " your question about the weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}},
+    {
+        "type": "content_block_start",
+        "index": 0,
+        "content_block": {"type": "text", "text": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": "To"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " answer"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " your question about the weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " in Boston and Los"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " Angeles today, I'll"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " need to"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " use"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " the"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " get_current_weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " function"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " both"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " cities"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": ". Let"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " me fetch"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " that"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " information"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " you."},
+    },
     {"type": "content_block_stop", "index": 0},
-    {"type": "content_block_start", "index": 1,
-     "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 1,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_12345",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": '{"locat'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'},
+    },
     {"type": "content_block_stop", "index": 1},
-    {"type": "content_block_start", "index": 2,
-     "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 2,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_023423423",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": '{"l'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "oca"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "tio"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "s Angel"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'},
+    },
     {"type": "content_block_stop", "index": 2},
-    {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None},
-     "usage": {"output_tokens": 137}},
-    {"type": "message_stop"}
+    {
+        "type": "message_delta",
+        "delta": {"stop_reason": "tool_use", "stop_sequence": None},
+        "usage": {"output_tokens": 137},
+    },
+    {"type": "message_stop"},
 ]
@@ -211,12 +353,12 @@ def test_anthropic_tool_streaming():
     correct_tool_index = -1
     for chunk in anthropic_chunk_list:
         parsed_chunk = response_iter.chunk_parser(chunk)
-        if tool_use := parsed_chunk.get('tool_use'):
+        if tool_use := parsed_chunk.get("tool_use"):
             # We only increment when a new block starts
-            if tool_use.get('id') is not None:
+            if tool_use.get("id") is not None:
                 correct_tool_index += 1
-            assert tool_use['index'] == correct_tool_index
+            assert tool_use["index"] == correct_tool_index
 
 
 @pytest.mark.asyncio
@@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation():
     print(translated_params["messages"])
     assert len(translated_params["messages"]) > 0
     assert translated_params["messages"][0]["role"] == "user"


@@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name():
        print(f"mock_client.call_args: {mock_client.call_args.kwargs}")
        assert "azure/gpt-4" == mock_client.call_args.kwargs["model"]
+
+
+def test_completion_cost_anthropic_prompt_caching():
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+
+    model = "anthropic/claude-3-5-sonnet-20240620"
+
+    ## WRITE TO CACHE ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=100,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    ## READ FROM CACHE ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=100,
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+
+    assert cost_1 > cost_2


@@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response):
 
 @pytest.mark.asyncio
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_update_cache(
     dynamic_rate_limit_handler, mock_response, user_api_key_auth
 ):


@@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings():
     assert len(optional_params) == 0
 
 
+def test_google_ai_studio_optional_params_embeddings():
+    optional_params = get_optional_params_embeddings(
+        user="John",
+        encoding_format=None,
+        custom_llm_provider="gemini",
+        drop_params=True,
+    )
+
+    assert len(optional_params) == 0
+
+
 def test_openai_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(


@@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False):
     max_input_tokens: Required[Optional[int]]
     max_output_tokens: Required[Optional[int]]
     input_cost_per_token: Required[float]
+    cache_creation_input_token_cost: Optional[float]
+    cache_read_input_token_cost: Optional[float]
     input_cost_per_character: Optional[float]  # only for vertex ai models
     input_cost_per_token_above_128k_tokens: Optional[float]  # only for vertex ai models
     input_cost_per_character_above_128k_tokens: Optional[
@@ -454,6 +456,13 @@ class Choices(OpenAIObject):
 
 class Usage(CompletionUsage):
+    _cache_creation_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+    _cache_read_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+
     def __init__(
         self,
         prompt_tokens: Optional[int] = None,
@@ -466,9 +475,18 @@ class Usage(CompletionUsage):
             "completion_tokens": completion_tokens or 0,
             "total_tokens": total_tokens or 0,
         }
         super().__init__(**data)
+
+        if "cache_creation_input_tokens" in params and isinstance(
+            params["cache_creation_input_tokens"], int
+        ):
+            self._cache_creation_input_tokens = params["cache_creation_input_tokens"]
+        if "cache_read_input_tokens" in params and isinstance(
+            params["cache_read_input_tokens"], int
+        ):
+            self._cache_read_input_tokens = params["cache_read_input_tokens"]
         for k, v in params.items():
             setattr(self, k, v)
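Because OpenAI's CompletionUsage has no equivalent fields yet, the anthropic counters ride along on Usage as private attributes. A small sketch of how they surface (values are illustrative):

    # Sketch: extra anthropic usage keys are captured as private attributes on
    # the Usage object (and also set as plain attributes by the setattr loop).
    from litellm.types.utils import Usage

    usage = Usage(
        prompt_tokens=14,
        completion_tokens=10,
        total_tokens=24,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )

    assert usage._cache_creation_input_tokens == 100
    assert usage._cache_read_input_tokens == 0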


@@ -2550,6 +2550,7 @@ def get_optional_params_embeddings(
     encoding_format=None,
     dimensions=None,
     custom_llm_provider="",
+    drop_params: Optional[bool] = None,
     additional_drop_params: Optional[bool] = None,
     **kwargs,
 ):
@@ -2560,6 +2561,7 @@ def get_optional_params_embeddings(
     for k, v in special_params.items():
         passed_params[k] = v
 
+    drop_params = passed_params.pop("drop_params", None)
     additional_drop_params = passed_params.pop("additional_drop_params", None)
 
     default_params = {"user": None, "encoding_format": None, "dimensions": None}
@@ -2571,11 +2573,16 @@ def get_optional_params_embeddings(
     for k in non_default_params.keys():
         if k not in supported_params:
             unsupported_params[k] = non_default_params[k]
-    if unsupported_params and not litellm.drop_params:
-        raise UnsupportedParamsError(
-            status_code=500,
-            message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
-        )
+    if unsupported_params:
+        if litellm.drop_params is True or (
+            drop_params is not None and drop_params is True
+        ):
+            pass
+        else:
+            raise UnsupportedParamsError(
+                status_code=500,
+                message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
+            )
 
     non_default_params = _get_non_default_params(
         passed_params=passed_params,
@@ -2680,7 +2687,9 @@ def get_optional_params_embeddings(
         and custom_llm_provider not in litellm.openai_compatible_providers
     ):
         if len(non_default_params.keys()) > 0:
-            if litellm.drop_params is True:  # drop the unsupported non-default values
+            if (
+                litellm.drop_params is True or drop_params is True
+            ):  # drop the unsupported non-default values
                 keys = list(non_default_params.keys())
                 for k in keys:
                     non_default_params.pop(k, None)
@@ -5335,6 +5344,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
            max_input_tokens=_model_info.get("max_input_tokens", None),
            max_output_tokens=_model_info.get("max_output_tokens", None),
            input_cost_per_token=_input_cost_per_token,
+            cache_creation_input_token_cost=_model_info.get(
+                "cache_creation_input_token_cost", None
+            ),
+            cache_read_input_token_cost=_model_info.get(
+                "cache_read_input_token_cost", None
+            ),
            input_cost_per_character=_model_info.get(
                "input_cost_per_character", None
            ),
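On the embeddings side, the request-level drop_params flag is now honored when filtering optional params, so unsupported fields are dropped instead of raising UnsupportedParamsError. A hedged sketch under those assumptions (the model name is illustrative; drop_params can also be set globally via litellm.drop_params = True, or on the proxy with litellm_settings: drop_params: true, as the error message above suggests):

    # Sketch: per-request drop_params for an embedding call; `user` is not a
    # supported param for this provider, so it should be dropped rather than
    # raising UnsupportedParamsError. Model name is illustrative.
    import litellm

    response = litellm.embedding(
        model="gemini/text-embedding-004",
        input=["hello world"],
        user="John",
        drop_params=True,
    )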


@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,