forked from phoenix/litellm-mirror
anthropic prompt caching cost tracking (#5453)
* fix(utils.py): support 'drop_params' for embedding requests. Fixes https://github.com/BerriAI/litellm/issues/5444
* feat(anthropic/cost_calculation.py): support calculating cost for prompt caching on anthropic
* feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out
* fix: fix linting errors
* test: mark flaky test
This commit is contained in: parent e6faaba56e, commit 65a9c933ad

17 changed files with 432 additions and 84 deletions
@@ -841,10 +841,10 @@ ALL_LITELLM_RESPONSE_TYPES = [
 from .types.utils import ImageObject
 from .llms.custom_llm import CustomLLM
 from .llms.huggingface_restapi import HuggingfaceConfig
-from .llms.anthropic import AnthropicConfig
+from .llms.anthropic.chat import AnthropicConfig
+from .llms.anthropic.completion import AnthropicTextConfig
 from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig
 from .llms.predibase import PredibaseConfig
-from .llms.anthropic_text import AnthropicTextConfig
 from .llms.replicate import ReplicateConfig
 from .llms.cohere.completion import CohereConfig
 from .llms.clarifai import ClarifaiConfig

@@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import (
     cost_router as google_cost_router,
 )
 from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character
+from litellm.llms.anthropic.cost_calculation import (
+    cost_per_token as anthropic_cost_per_token,
+)
 from litellm.types.llms.openai import HttpxBinaryResponseContent
 from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
+from litellm.types.utils import Usage
 from litellm.utils import (
     CallTypes,
     CostPerToken,
@@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper(

 def cost_per_token(
     model: str = "",
-    prompt_tokens: float = 0,
-    completion_tokens: float = 0,
+    prompt_tokens: int = 0,
+    completion_tokens: int = 0,
     response_time_ms=None,
     custom_llm_provider: Optional[str] = None,
     region_name=None,
     ### CHARACTER PRICING ###
-    prompt_characters: float = 0,
-    completion_characters: float = 0,
+    prompt_characters: int = 0,
+    completion_characters: int = 0,
+    ### PROMPT CACHING PRICING ### - used for anthropic
+    cache_creation_input_tokens: Optional[int] = 0,
+    cache_read_input_tokens: Optional[int] = 0,
     ### CUSTOM PRICING ###
     custom_cost_per_token: Optional[CostPerToken] = None,
     custom_cost_per_second: Optional[float] = None,
@@ -108,6 +115,16 @@ def cost_per_token(
     """
     if model is None:
         raise Exception("Invalid arg. Model cannot be none.")
+
+    ## RECONSTRUCT USAGE BLOCK ##
+    usage_block = Usage(
+        prompt_tokens=prompt_tokens,
+        completion_tokens=completion_tokens,
+        total_tokens=prompt_tokens + completion_tokens,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
+    )
+
     ## CUSTOM PRICING ##
     response_cost = _cost_per_token_custom_pricing_helper(
         prompt_tokens=prompt_tokens,
@@ -137,6 +154,7 @@ def cost_per_token(
         model_with_provider = model_with_provider_and_region
     else:
         _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model)
+
     model_without_prefix = model
     model_parts = model.split("/")
     if len(model_parts) > 1:
@@ -162,6 +180,7 @@ def cost_per_token(

     # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models
     print_verbose(f"Looking up model={model} in model_cost_map")
+
     if custom_llm_provider == "vertex_ai":
         cost_router = google_cost_router(
             model=model_without_prefix,
@@ -188,6 +207,8 @@ def cost_per_token(
             prompt_tokens=prompt_tokens,
             completion_tokens=completion_tokens,
         )
+    elif custom_llm_provider == "anthropic":
+        return anthropic_cost_per_token(model=model, usage=usage_block)
     elif custom_llm_provider == "gemini":
         return google_cost_per_token(
             model=model_without_prefix,
@@ -520,6 +541,8 @@ def completion_cost(
     prompt_characters = 0
     completion_tokens = 0
     completion_characters = 0
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
     if completion_response is not None and (
         isinstance(completion_response, BaseModel)
         or isinstance(completion_response, dict)
@@ -541,6 +564,13 @@ def completion_cost(
             completion_tokens = completion_response.get("usage", {}).get(
                 "completion_tokens", 0
             )
+            cache_creation_input_tokens = completion_response.get("usage", {}).get(
+                "cache_creation_input_tokens", 0
+            )
+            cache_read_input_tokens = completion_response.get("usage", {}).get(
+                "cache_read_input_tokens", 0
+            )
+
             total_time = getattr(completion_response, "_response_ms", 0)
             verbose_logger.debug(
                 f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} "
@@ -550,7 +580,7 @@ def completion_cost(
             )
             if hasattr(completion_response, "_hidden_params"):
                 custom_llm_provider = completion_response._hidden_params.get(
-                    "custom_llm_provider", custom_llm_provider or ""
+                    "custom_llm_provider", custom_llm_provider or None
                 )
                 region_name = completion_response._hidden_params.get(
                     "region_name", region_name
@@ -697,6 +727,8 @@ def completion_cost(
         custom_cost_per_token=custom_cost_per_token,
         prompt_characters=prompt_characters,
         completion_characters=completion_characters,
+        cache_creation_input_tokens=cache_creation_input_tokens,
+        cache_read_input_tokens=cache_read_input_tokens,
         call_type=call_type,
     )
     _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar

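A minimal sketch of driving the reworked `cost_per_token` signature above directly. This is not part of the diff: the token counts are invented for illustration, and it assumes a litellm build containing this commit.

    import litellm

    # cache_creation / cache_read are the new prompt-caching counters
    # threaded through cost_per_token in the hunk above.
    prompt_cost, completion_cost = litellm.cost_per_token(
        model="claude-3-5-sonnet-20240620",
        custom_llm_provider="anthropic",
        prompt_tokens=14,
        completion_tokens=10,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )
    print(prompt_cost, completion_cost)
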
@@ -1,3 +1,7 @@
+"""
+Calling + translation logic for anthropic's `/v1/messages` endpoint
+"""
+
 import copy
 import json
 import os
@@ -70,8 +74,8 @@ from litellm.types.llms.openai import (
 from litellm.types.utils import Choices, GenericStreamingChunk
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory


 class AnthropicConstants(Enum):
@@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM):
             )
         except Exception as e:
             verbose_logger.exception(
-                "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
+                "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
                     str(e), messages
                 )
             )

@@ -1,3 +1,7 @@
+"""
+Translation logic for anthropic's `/v1/complete` endpoint
+"""
+
 import json
 import os
 import time
@@ -12,8 +16,8 @@ import litellm
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

-from .base import BaseLLM
-from .prompt_templates.factory import custom_prompt, prompt_factory
+from ..base import BaseLLM
+from ..prompt_templates.factory import custom_prompt, prompt_factory


 class AnthropicConstants(Enum):

litellm/llms/anthropic/cost_calculation.py (new file, 42 lines)

@@ -0,0 +1,42 @@
+"""
+Helper util for handling anthropic-specific cost calculation
+- e.g.: prompt caching
+"""
+
+from typing import Tuple
+
+from litellm.types.utils import Usage
+from litellm.utils import get_model_info
+
+
+def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
+    """
+    Calculates the cost per token for a given model, prompt tokens, and completion tokens.
+
+    Input:
+        - model: str, the model name without provider prefix
+        - usage: LiteLLM Usage block, containing anthropic caching information
+
+    Returns:
+        Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd
+    """
+    ## GET MODEL INFO
+    model_info = get_model_info(model=model, custom_llm_provider="anthropic")
+
+    ## CALCULATE INPUT COST
+
+    prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"]
+    if model_info.get("cache_creation_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_creation_input_tokens  # type: ignore
+            * model_info["cache_creation_input_token_cost"]
+        )
+    if model_info.get("cache_read_input_token_cost") is not None:
+        prompt_cost += (
+            usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"]  # type: ignore
+        )
+
+    ## CALCULATE OUTPUT COST
+    completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"]
+
+    return prompt_cost, completion_cost

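A rough sketch of exercising the new helper on its own, separate from the diff itself. The Usage block mirrors the one `cost_per_token` builds in cost_calculator above; the token counts are invented for illustration.

    from litellm.llms.anthropic.cost_calculation import cost_per_token
    from litellm.types.utils import Usage

    # 14 uncached prompt tokens, 100 tokens written to the cache, nothing read back.
    usage = Usage(
        prompt_tokens=14,
        completion_tokens=10,
        total_tokens=24,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )

    prompt_cost, completion_cost = cost_per_token(
        model="claude-3-5-sonnet-20240620", usage=usage
    )
    # prompt_cost = 14 * input_cost_per_token + 100 * cache_creation_input_token_cost
    # completion_cost = 10 * output_cost_per_token
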
@@ -1,11 +1,14 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
+from typing import Any, Optional, Union
+
+import httpx
+import requests
+
 import litellm
-import httpx, requests
-from typing import Optional, Union
-from litellm.litellm_core_utils.litellm_logging import Logging
+

 class BaseLLM:

     _client_session: Optional[httpx.Client] = None

     def process_response(
@@ -14,7 +17,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.ModelResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],
@@ -33,7 +36,7 @@ class BaseLLM:
         response: Union[requests.Response, httpx.Response],
         model_response: litellm.utils.TextCompletionResponse,
         stream: bool,
-        logging_obj: Logging,
+        logging_obj: Any,
         optional_params: dict,
         api_key: str,
         data: Union[dict, str],

@@ -267,18 +267,19 @@ def completion(
 ):
     try:
         import vertexai
-        from anthropic import AnthropicVertex
-
-        from litellm.llms.anthropic import AnthropicChatCompletion
-        from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
-            VertexLLM,
-        )
     except:
         raise VertexAIError(
             status_code=400,
             message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""",
         )

+    from anthropic import AnthropicVertex
+
+    from litellm.llms.anthropic.chat import AnthropicChatCompletion
+    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexLLM,
+    )
+
     if not (
         hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
     ):

@@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache
 from .llms import (
     ai21,
     aleph_alpha,
-    anthropic_text,
     baseten,
     bedrock,
     clarifai,
     cloudflare,
     gemini,
     huggingface_restapi,
     maritalk,
     nlp_cloud,
     ollama,
@@ -93,13 +90,10 @@ from .llms import (
     palm,
     petals,
     replicate,
     together_ai,
     triton,
     vllm,
     watsonx,
 )
-from .llms.anthropic import AnthropicChatCompletion
-from .llms.anthropic_text import AnthropicTextCompletion
+from .llms.anthropic.chat import AnthropicChatCompletion
+from .llms.anthropic.completion import AnthropicTextCompletion
 from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params
 from .llms.azure_text import AzureTextCompletion
 from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM

@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,

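Back-of-the-envelope check (not output from the code) using the first set of rates added above, for a request with 1,000 regular prompt tokens, 2,000 cache-creation tokens, and 500 cache-read tokens:

    input_cost = (
        1_000 * 0.00000025   # input_cost_per_token
        + 2_000 * 0.0000003  # cache_creation_input_token_cost
        + 500 * 0.00000003   # cache_read_input_token_cost
    )
    assert round(input_cost, 9) == 0.000865
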
@@ -1,4 +1,4 @@
 model_list:
-  - model_name: "gemini/*"
+  - model_name: "gpt-3.5-turbo"
     litellm_params:
-      model: "gemini/*"
+      model: "gpt-3.5-turbo"

@@ -10,7 +10,7 @@ from dotenv import load_dotenv

 import litellm.types
 import litellm.types.utils
-from litellm.llms.anthropic import ModelResponseIterator
+from litellm.llms.anthropic.chat import ModelResponseIterator

 load_dotenv()
 import io
@@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream):


 anthropic_chunk_list = [
-    {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}},
-    {"type": "content_block_delta", "index": 0,
-     "delta": {"type": "text_delta", "text": " your question about the weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}},
-    {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}},
+    {
+        "type": "content_block_start",
+        "index": 0,
+        "content_block": {"type": "text", "text": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": "To"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " answer"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " your question about the weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " in Boston and Los"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " Angeles today, I'll"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " need to"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " use"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " the"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " get_current_weather"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " function"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " both"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " cities"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": ". Let"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " me fetch"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " that"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " information"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " for"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 0,
+        "delta": {"type": "text_delta", "text": " you."},
+    },
     {"type": "content_block_stop", "index": 0},
-    {"type": "content_block_start", "index": 1,
-     "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}},
-    {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 1,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_12345",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": '{"locat'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 1,
+        "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'},
+    },
     {"type": "content_block_stop", "index": 1},
-    {"type": "content_block_start", "index": 2,
-     "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}},
-    {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}},
+    {
+        "type": "content_block_start",
+        "index": 2,
+        "content_block": {
+            "type": "tool_use",
+            "id": "toolu_023423423",
+            "name": "get_current_weather",
+            "input": {},
+        },
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": ""},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": '{"l'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "oca"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "tio"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": "s Angel"},
+    },
+    {
+        "type": "content_block_delta",
+        "index": 2,
+        "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'},
+    },
     {"type": "content_block_stop", "index": 2},
-    {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None},
-     "usage": {"output_tokens": 137}},
-    {"type": "message_stop"}
+    {
+        "type": "message_delta",
+        "delta": {"stop_reason": "tool_use", "stop_sequence": None},
+        "usage": {"output_tokens": 137},
+    },
+    {"type": "message_stop"},
 ]


@@ -211,12 +353,12 @@ def test_anthropic_tool_streaming():
     correct_tool_index = -1
     for chunk in anthropic_chunk_list:
         parsed_chunk = response_iter.chunk_parser(chunk)
-        if tool_use := parsed_chunk.get('tool_use'):
+        if tool_use := parsed_chunk.get("tool_use"):

             # We only increment when a new block starts
-            if tool_use.get('id') is not None:
+            if tool_use.get("id") is not None:
                 correct_tool_index += 1
-            assert tool_use['index'] == correct_tool_index
+            assert tool_use["index"] == correct_tool_index


 @pytest.mark.asyncio
@@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation():
     print(translated_params["messages"])

     assert len(translated_params["messages"]) > 0
-    assert translated_params["messages"][0]["role"] == "user"
\ No newline at end of file
+    assert translated_params["messages"][0]["role"] == "user"

@@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name():

     print(f"mock_client.call_args: {mock_client.call_args.kwargs}")
     assert "azure/gpt-4" == mock_client.call_args.kwargs["model"]
+
+
+def test_completion_cost_anthropic_prompt_caching():
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
+    from litellm.utils import Choices, Message, ModelResponse, Usage
+
+    model = "anthropic/claude-3-5-sonnet-20240620"
+
+    ## WRITE TO CACHE ## (MORE EXPENSIVE)
+    response_1 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=100,
+            cache_read_input_tokens=0,
+        ),
+    )
+
+    ## READ FROM CACHE ## (LESS EXPENSIVE)
+    response_2 = ModelResponse(
+        id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424",
+        choices=[
+            Choices(
+                finish_reason="length",
+                index=0,
+                message=Message(
+                    content="Hello! I'm doing well, thank you for",
+                    role="assistant",
+                    tool_calls=None,
+                    function_call=None,
+                ),
+            )
+        ],
+        created=1725036547,
+        model="claude-3-5-sonnet-20240620",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=Usage(
+            completion_tokens=10,
+            prompt_tokens=14,
+            total_tokens=24,
+            cache_creation_input_tokens=0,
+            cache_read_input_tokens=100,
+        ),
+    )
+
+    cost_1 = completion_cost(model=model, completion_response=response_1)
+    cost_2 = completion_cost(model=model, completion_response=response_2)
+
+    assert cost_1 > cost_2

@@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response):


 @pytest.mark.asyncio
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_update_cache(
     dynamic_rate_limit_handler, mock_response, user_api_key_auth
 ):

@@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings():
     assert len(optional_params) == 0


+def test_google_ai_studio_optional_params_embeddings():
+    optional_params = get_optional_params_embeddings(
+        user="John",
+        encoding_format=None,
+        custom_llm_provider="gemini",
+        drop_params=True,
+    )
+    assert len(optional_params) == 0
+
+
 def test_openai_optional_params_embeddings():
     litellm.drop_params = True
     optional_params = get_optional_params_embeddings(

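The same drop_params behaviour from the caller's side, as a sketch rather than part of the diff. The model name is a hypothetical choice; the point is that params gemini embeddings do not support (such as `user`) are now dropped per-request instead of raising.

    import litellm

    response = litellm.embedding(
        model="gemini/text-embedding-004",  # hypothetical model choice
        input=["hello world"],
        user="John",          # unsupported for this provider
        drop_params=True,     # dropped instead of raising UnsupportedParamsError
    )
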
@@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False):
     max_input_tokens: Required[Optional[int]]
     max_output_tokens: Required[Optional[int]]
     input_cost_per_token: Required[float]
+    cache_creation_input_token_cost: Optional[float]
+    cache_read_input_token_cost: Optional[float]
     input_cost_per_character: Optional[float]  # only for vertex ai models
     input_cost_per_token_above_128k_tokens: Optional[float]  # only for vertex ai models
     input_cost_per_character_above_128k_tokens: Optional[
@@ -454,6 +456,13 @@ class Choices(OpenAIObject):


 class Usage(CompletionUsage):
+    _cache_creation_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+    _cache_read_input_tokens: int = PrivateAttr(
+        0
+    )  # hidden param for prompt caching. Might change, once openai introduces their equivalent.
+
     def __init__(
         self,
         prompt_tokens: Optional[int] = None,
@@ -466,9 +475,18 @@ class Usage(CompletionUsage):
             "completion_tokens": completion_tokens or 0,
             "total_tokens": total_tokens or 0,
         }

         super().__init__(**data)

+        if "cache_creation_input_tokens" in params and isinstance(
+            params["cache_creation_input_tokens"], int
+        ):
+            self._cache_creation_input_tokens = params["cache_creation_input_tokens"]
+
+        if "cache_read_input_tokens" in params and isinstance(
+            params["cache_read_input_tokens"], int
+        ):
+            self._cache_read_input_tokens = params["cache_read_input_tokens"]
+
         for k, v in params.items():
             setattr(self, k, v)

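A small sketch of how the new hidden counters behave on the Usage object above (not part of the diff; attribute names are taken from the class as shown and are private, so they may change once OpenAI ships an equivalent field):

    from litellm.types.utils import Usage

    usage = Usage(
        prompt_tokens=14,
        completion_tokens=10,
        total_tokens=24,
        cache_creation_input_tokens=100,
        cache_read_input_tokens=0,
    )

    # Stored as private attrs so they don't collide with OpenAI's usage schema.
    assert usage._cache_creation_input_tokens == 100
    assert usage._cache_read_input_tokens == 0
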
@@ -2550,6 +2550,7 @@ def get_optional_params_embeddings(
     encoding_format=None,
     dimensions=None,
     custom_llm_provider="",
+    drop_params: Optional[bool] = None,
     additional_drop_params: Optional[bool] = None,
     **kwargs,
 ):
@@ -2560,6 +2561,7 @@ def get_optional_params_embeddings(
     for k, v in special_params.items():
         passed_params[k] = v

+    drop_params = passed_params.pop("drop_params", None)
     additional_drop_params = passed_params.pop("additional_drop_params", None)

     default_params = {"user": None, "encoding_format": None, "dimensions": None}
@@ -2571,11 +2573,16 @@ def get_optional_params_embeddings(
     for k in non_default_params.keys():
         if k not in supported_params:
             unsupported_params[k] = non_default_params[k]
-    if unsupported_params and not litellm.drop_params:
-        raise UnsupportedParamsError(
-            status_code=500,
-            message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
-        )
+    if unsupported_params:
+        if litellm.drop_params is True or (
+            drop_params is not None and drop_params is True
+        ):
+            pass
+        else:
+            raise UnsupportedParamsError(
+                status_code=500,
+                message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
+            )

     non_default_params = _get_non_default_params(
         passed_params=passed_params,
@@ -2680,7 +2687,9 @@ def get_optional_params_embeddings(
         and custom_llm_provider not in litellm.openai_compatible_providers
     ):
         if len(non_default_params.keys()) > 0:
-            if litellm.drop_params is True:  # drop the unsupported non-default values
+            if (
+                litellm.drop_params is True or drop_params is True
+            ):  # drop the unsupported non-default values
                 keys = list(non_default_params.keys())
                 for k in keys:
                     non_default_params.pop(k, None)
@@ -5335,6 +5344,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
             max_input_tokens=_model_info.get("max_input_tokens", None),
             max_output_tokens=_model_info.get("max_output_tokens", None),
             input_cost_per_token=_input_cost_per_token,
+            cache_creation_input_token_cost=_model_info.get(
+                "cache_creation_input_token_cost", None
+            ),
+            cache_read_input_token_cost=_model_info.get(
+                "cache_read_input_token_cost", None
+            ),
             input_cost_per_character=_model_info.get(
                 "input_cost_per_character", None
             ),

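get_model_info now surfaces the two cache prices; a quick sketch (not part of the diff), assuming the local model-cost map contains the anthropic entries added in this commit:

    from litellm import get_model_info

    info = get_model_info(
        model="claude-3-5-sonnet-20240620", custom_llm_provider="anthropic"
    )
    print(info["input_cost_per_token"])
    print(info.get("cache_creation_input_token_cost"))  # per the rates added above
    print(info.get("cache_read_input_token_cost"))      # per the rates added above
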
@@ -1336,6 +1336,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.00000025,
         "output_cost_per_token": 0.00000125,
+        "cache_creation_input_token_cost": 0.0000003,
+        "cache_read_input_token_cost": 0.00000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1349,6 +1351,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000015,
         "output_cost_per_token": 0.000075,
+        "cache_creation_input_token_cost": 0.00001875,
+        "cache_read_input_token_cost": 0.0000015,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,
@@ -1375,6 +1379,8 @@
         "max_output_tokens": 4096,
         "input_cost_per_token": 0.000003,
         "output_cost_per_token": 0.000015,
+        "cache_creation_input_token_cost": 0.00000375,
+        "cache_read_input_token_cost": 0.0000003,
         "litellm_provider": "anthropic",
         "mode": "chat",
         "supports_function_calling": true,