From 6ab601432b2d6113725544f08702ff56005fc162 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 10:05:23 -0700 Subject: [PATCH 1/4] feat prometheus add metric for failure / model --- litellm/integrations/prometheus.py | 19 ++++++++++++++++++- litellm/proxy/proxy_config.yaml | 15 +++------------ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 642a776b6..ed5035074 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -138,6 +138,13 @@ class PrometheusLogger(CustomLogger): labelnames=["hashed_api_key", "api_key_alias", "model"], ) + # New metric for tracking error codes and models + self.litellm_error_code_metric = Counter( + "litellm_error_code_metric", + "Total number of errors by error code and model", + labelnames=["error_code", "model"], + ) + # Litellm-Enterprise Metrics if premium_user is True: @@ -378,7 +385,7 @@ class PrometheusLogger(CustomLogger): from litellm.proxy.proxy_server import premium_user verbose_logger.debug( - f"prometheus Logging - Enters success logging function for kwargs {kwargs}" + f"prometheus Logging - Enters failure logging function for kwargs {kwargs}" ) # unpack kwargs @@ -409,6 +416,16 @@ class PrometheusLogger(CustomLogger): user_id, ).inc() self.set_llm_deployment_failure_metrics(kwargs) + + _exception = kwargs.get("exception", None) + error_code = "unknown" + if _exception is not None and hasattr(_exception, "status_code"): + error_code = _exception.status_code + + # Increment the new error code metric + self.litellm_error_code_metric.labels( + error_code=error_code, model=model + ).inc() except Exception as e: verbose_logger.exception( "prometheus Layer Error(): Exception occured - {}".format(str(e)) diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index c5f736bac..e97833421 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -3,21 +3,12 @@ model_list: litellm_params: model: openai/fake api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["teamA"] # 👈 Key Change model_info: id: "team-a-model" # used for identifying model in response headers - - model_name: fake-openai-endpoint - litellm_params: - model: openai/fake - api_key: fake-key - api_base: https://exampleopenaiendpoint-production.up.railway.app/ - tags: ["teamB"] # 👈 Key Change - model_info: - id: "team-b-model" # used for identifying model in response headers -router_settings: - enable_tag_filtering: True # 👈 Key Change +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] general_settings: master_key: sk-1234 \ No newline at end of file From e6faaba56e83585995dd0b6ce8ac89c29d1f12c7 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Sat, 31 Aug 2024 14:46:41 -0700 Subject: [PATCH 2/4] docs add litellm_error_code_metric_total --- docs/my-website/docs/proxy/prometheus.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 10e6456c2..6ccf0e44e 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -58,6 +58,15 @@ http://localhost:4000/metrics ## 📈 Metrics Tracked +### Error Metrics + +| Metric Name | Description | +|----------------------|--------------------------------------| +| `litellm_error_code_metric_total` | Total number of errors by error code and model | + 
+This metric provides a count of errors encountered, categorized by error code and model. For example: + + ### Proxy Requests / Spend Metrics @@ -66,7 +75,12 @@ http://localhost:4000/metrics | `litellm_requests_metric` | Number of requests made, per `"user", "key", "model", "team", "end-user"` | | `litellm_spend_metric` | Total Spend, per `"user", "key", "model", "team", "end-user"` | | `litellm_total_tokens` | input + output tokens per `"user", "key", "model", "team", "end-user"` | + +### Error Monitoring Metrics + +| Metric Name | Description | | `litellm_llm_api_failed_requests_metric` | Number of failed LLM API requests per `"user", "key", "model", "team", "end-user"` | +| `litellm_error_code_metric_total` | Total number of errors by error code and model | ### Request Latency Metrics From 65a9c933adaae9daca0a59e5bd1ac42dd5be2a41 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Sat, 31 Aug 2024 14:09:35 -0700 Subject: [PATCH 3/4] anthropic prompt caching cost tracking (#5453) * fix(utils.py): support 'drop_params' for embedding requests Fixes https://github.com/BerriAI/litellm/issues/5444 * feat(anthropic/cost_calculation.py): Support calculating cost for prompt caching on anthropic * feat(types/utils.py): allows us to migrate to openai's equivalent, once that comes out * fix: fix linting errors * test: mark flaky test --- litellm/__init__.py | 4 +- litellm/cost_calculator.py | 42 +++- .../llms/{anthropic.py => anthropic/chat.py} | 10 +- .../completion.py} | 8 +- litellm/llms/anthropic/cost_calculation.py | 42 ++++ litellm/llms/base.py | 13 +- .../vertex_ai_anthropic.py | 13 +- litellm/main.py | 10 +- ...odel_prices_and_context_window_backup.json | 6 + litellm/proxy/_new_secret_config.yaml | 4 +- litellm/tests/test_anthropic_completion.py | 230 ++++++++++++++---- litellm/tests/test_completion_cost.py | 70 ++++++ .../tests/test_dynamic_rate_limit_handler.py | 1 + litellm/tests/test_optional_params.py | 10 + litellm/types/utils.py | 20 +- litellm/utils.py | 27 +- model_prices_and_context_window.json | 6 + 17 files changed, 432 insertions(+), 84 deletions(-) rename litellm/llms/{anthropic.py => anthropic/chat.py} (99%) rename litellm/llms/{anthropic_text.py => anthropic/completion.py} (98%) create mode 100644 litellm/llms/anthropic/cost_calculation.py diff --git a/litellm/__init__.py b/litellm/__init__.py index 1c3b8434f..0436e039c 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -841,10 +841,10 @@ ALL_LITELLM_RESPONSE_TYPES = [ from .types.utils import ImageObject from .llms.custom_llm import CustomLLM from .llms.huggingface_restapi import HuggingfaceConfig -from .llms.anthropic import AnthropicConfig +from .llms.anthropic.chat import AnthropicConfig +from .llms.anthropic.completion import AnthropicTextConfig from .llms.databricks import DatabricksConfig, DatabricksEmbeddingConfig from .llms.predibase import PredibaseConfig -from .llms.anthropic_text import AnthropicTextConfig from .llms.replicate import ReplicateConfig from .llms.cohere.completion import CohereConfig from .llms.clarifai import ClarifaiConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 3c025055e..a0645c19a 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -19,8 +19,12 @@ from litellm.litellm_core_utils.llm_cost_calc.google import ( cost_router as google_cost_router, ) from litellm.litellm_core_utils.llm_cost_calc.utils import _generic_cost_per_character +from litellm.llms.anthropic.cost_calculation import ( + cost_per_token as 
anthropic_cost_per_token, +) from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS +from litellm.types.utils import Usage from litellm.utils import ( CallTypes, CostPerToken, @@ -59,14 +63,17 @@ def _cost_per_token_custom_pricing_helper( def cost_per_token( model: str = "", - prompt_tokens: float = 0, - completion_tokens: float = 0, + prompt_tokens: int = 0, + completion_tokens: int = 0, response_time_ms=None, custom_llm_provider: Optional[str] = None, region_name=None, ### CHARACTER PRICING ### - prompt_characters: float = 0, - completion_characters: float = 0, + prompt_characters: int = 0, + completion_characters: int = 0, + ### PROMPT CACHING PRICING ### - used for anthropic + cache_creation_input_tokens: Optional[int] = 0, + cache_read_input_tokens: Optional[int] = 0, ### CUSTOM PRICING ### custom_cost_per_token: Optional[CostPerToken] = None, custom_cost_per_second: Optional[float] = None, @@ -108,6 +115,16 @@ def cost_per_token( """ if model is None: raise Exception("Invalid arg. Model cannot be none.") + + ## RECONSTRUCT USAGE BLOCK ## + usage_block = Usage( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, + ) + ## CUSTOM PRICING ## response_cost = _cost_per_token_custom_pricing_helper( prompt_tokens=prompt_tokens, @@ -137,6 +154,7 @@ def cost_per_token( model_with_provider = model_with_provider_and_region else: _, custom_llm_provider, _, _ = litellm.get_llm_provider(model=model) + model_without_prefix = model model_parts = model.split("/") if len(model_parts) > 1: @@ -162,6 +180,7 @@ def cost_per_token( # see this https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models print_verbose(f"Looking up model={model} in model_cost_map") + if custom_llm_provider == "vertex_ai": cost_router = google_cost_router( model=model_without_prefix, @@ -188,6 +207,8 @@ def cost_per_token( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, ) + elif custom_llm_provider == "anthropic": + return anthropic_cost_per_token(model=model, usage=usage_block) elif custom_llm_provider == "gemini": return google_cost_per_token( model=model_without_prefix, @@ -520,6 +541,8 @@ def completion_cost( prompt_characters = 0 completion_tokens = 0 completion_characters = 0 + cache_creation_input_tokens: Optional[int] = None + cache_read_input_tokens: Optional[int] = None if completion_response is not None and ( isinstance(completion_response, BaseModel) or isinstance(completion_response, dict) @@ -541,6 +564,13 @@ def completion_cost( completion_tokens = completion_response.get("usage", {}).get( "completion_tokens", 0 ) + cache_creation_input_tokens = completion_response.get("usage", {}).get( + "cache_creation_input_tokens", 0 + ) + cache_read_input_tokens = completion_response.get("usage", {}).get( + "cache_read_input_tokens", 0 + ) + total_time = getattr(completion_response, "_response_ms", 0) verbose_logger.debug( f"completion_response response ms: {getattr(completion_response, '_response_ms', None)} " @@ -550,7 +580,7 @@ def completion_cost( ) if hasattr(completion_response, "_hidden_params"): custom_llm_provider = completion_response._hidden_params.get( - "custom_llm_provider", custom_llm_provider or "" + "custom_llm_provider", custom_llm_provider or None ) region_name = completion_response._hidden_params.get( "region_name", region_name @@ 
-697,6 +727,8 @@ def completion_cost( custom_cost_per_token=custom_cost_per_token, prompt_characters=prompt_characters, completion_characters=completion_characters, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, call_type=call_type, ) _final_cost = prompt_tokens_cost_usd_dollar + completion_tokens_cost_usd_dollar diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic/chat.py similarity index 99% rename from litellm/llms/anthropic.py rename to litellm/llms/anthropic/chat.py index 813897c66..f62c7246e 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic/chat.py @@ -1,3 +1,7 @@ +""" +Calling + translation logic for anthropic's `/v1/messages` endpoint +""" + import copy import json import os @@ -70,8 +74,8 @@ from litellm.types.llms.openai import ( from litellm.types.utils import Choices, GenericStreamingChunk from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): @@ -982,7 +986,7 @@ class AnthropicChatCompletion(BaseLLM): ) except Exception as e: verbose_logger.exception( - "litellm.llms.anthropic.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( + "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( str(e), messages ) ) diff --git a/litellm/llms/anthropic_text.py b/litellm/llms/anthropic/completion.py similarity index 98% rename from litellm/llms/anthropic_text.py rename to litellm/llms/anthropic/completion.py index d20e49daf..dd2d47e53 100644 --- a/litellm/llms/anthropic_text.py +++ b/litellm/llms/anthropic/completion.py @@ -1,3 +1,7 @@ +""" +Translation logic for anthropic's `/v1/complete` endpoint +""" + import json import os import time @@ -12,8 +16,8 @@ import litellm from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.utils import CustomStreamWrapper, ModelResponse, Usage -from .base import BaseLLM -from .prompt_templates.factory import custom_prompt, prompt_factory +from ..base import BaseLLM +from ..prompt_templates.factory import custom_prompt, prompt_factory class AnthropicConstants(Enum): diff --git a/litellm/llms/anthropic/cost_calculation.py b/litellm/llms/anthropic/cost_calculation.py new file mode 100644 index 000000000..d1742aae9 --- /dev/null +++ b/litellm/llms/anthropic/cost_calculation.py @@ -0,0 +1,42 @@ +""" +Helper util for handling anthropic-specific cost calculation +- e.g.: prompt caching +""" + +from typing import Tuple + +from litellm.types.utils import Usage +from litellm.utils import get_model_info + + +def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]: + """ + Calculates the cost per token for a given model, prompt tokens, and completion tokens. 
+ + Input: + - model: str, the model name without provider prefix + - usage: LiteLLM Usage block, containing anthropic caching information + + Returns: + Tuple[float, float] - prompt_cost_in_usd, completion_cost_in_usd + """ + ## GET MODEL INFO + model_info = get_model_info(model=model, custom_llm_provider="anthropic") + + ## CALCULATE INPUT COST + + prompt_cost: float = usage["prompt_tokens"] * model_info["input_cost_per_token"] + if model_info.get("cache_creation_input_token_cost") is not None: + prompt_cost += ( + usage._cache_creation_input_tokens # type: ignore + * model_info["cache_creation_input_token_cost"] + ) + if model_info.get("cache_read_input_token_cost") is not None: + prompt_cost += ( + usage._cache_read_input_tokens * model_info["cache_read_input_token_cost"] # type: ignore + ) + + ## CALCULATE OUTPUT COST + completion_cost = usage["completion_tokens"] * model_info["output_cost_per_token"] + + return prompt_cost, completion_cost diff --git a/litellm/llms/base.py b/litellm/llms/base.py index 7e80de9ab..08c5e1992 100644 --- a/litellm/llms/base.py +++ b/litellm/llms/base.py @@ -1,11 +1,14 @@ ## This is a template base class to be used for adding new LLM providers via API calls +from typing import Any, Optional, Union + +import httpx +import requests + import litellm -import httpx, requests -from typing import Optional, Union -from litellm.litellm_core_utils.litellm_logging import Logging class BaseLLM: + _client_session: Optional[httpx.Client] = None def process_response( @@ -14,7 +17,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.ModelResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], @@ -33,7 +36,7 @@ class BaseLLM: response: Union[requests.Response, httpx.Response], model_response: litellm.utils.TextCompletionResponse, stream: bool, - logging_obj: Logging, + logging_obj: Any, optional_params: dict, api_key: str, data: Union[dict, str], diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py index e85160a43..025b27240 100644 --- a/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py +++ b/litellm/llms/vertex_ai_and_google_ai_studio/vertex_ai_anthropic.py @@ -267,18 +267,19 @@ def completion( ): try: import vertexai - from anthropic import AnthropicVertex - - from litellm.llms.anthropic import AnthropicChatCompletion - from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( - VertexLLM, - ) except: raise VertexAIError( status_code=400, message="""vertexai import failed please run `pip install -U google-cloud-aiplatform "anthropic[vertex]"`""", ) + from anthropic import AnthropicVertex + + from litellm.llms.anthropic.chat import AnthropicChatCompletion + from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import ( + VertexLLM, + ) + if not ( hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models") ): diff --git a/litellm/main.py b/litellm/main.py index 95a106377..f9ef4a419 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -77,13 +77,10 @@ from .caching import disable_cache, enable_cache, update_cache from .llms import ( ai21, aleph_alpha, - anthropic_text, baseten, bedrock, clarifai, cloudflare, - gemini, - huggingface_restapi, maritalk, nlp_cloud, ollama, @@ -93,13 +90,10 @@ from .llms import ( palm, petals, replicate, - together_ai, 
- triton, vllm, - watsonx, ) -from .llms.anthropic import AnthropicChatCompletion -from .llms.anthropic_text import AnthropicTextCompletion +from .llms.anthropic.chat import AnthropicChatCompletion +from .llms.anthropic.completion import AnthropicTextCompletion from .llms.azure import AzureChatCompletion, _check_dynamic_azure_params from .llms.azure_text import AzureTextCompletion from .llms.bedrock_httpx import BedrockConverseLLM, BedrockLLM diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 5e6d0f2ab..a60743c65 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index b8f964ab3..b84ef7453 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,4 +1,4 @@ model_list: - - model_name: "gemini/*" + - model_name: "gpt-3.5-turbo" litellm_params: - model: "gemini/*" \ No newline at end of file + model: "gpt-3.5-turbo" \ No newline at end of file diff --git a/litellm/tests/test_anthropic_completion.py b/litellm/tests/test_anthropic_completion.py index b5e01d448..b8ccf716e 100644 --- a/litellm/tests/test_anthropic_completion.py +++ b/litellm/tests/test_anthropic_completion.py @@ -10,7 +10,7 @@ from dotenv import load_dotenv import litellm.types import litellm.types.utils -from litellm.llms.anthropic import ModelResponseIterator +from litellm.llms.anthropic.chat import ModelResponseIterator load_dotenv() import io @@ -152,48 +152,190 @@ def test_anthropic_completion_e2e(stream): anthropic_chunk_list = [ - {"type": "content_block_start", "index": 0, "content_block": {"type": "text", "text": ""}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": "To"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " answer"}}, - {"type": "content_block_delta", "index": 0, - "delta": {"type": "text_delta", "text": " your question about the weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " in Boston and Los"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " Angeles today, I'll"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " need to"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " use"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " the"}}, - {"type": "content_block_delta", 
"index": 0, "delta": {"type": "text_delta", "text": " get_current_weather"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " function"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " both"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " cities"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": ". Let"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " me fetch"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " that"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " information"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " for"}}, - {"type": "content_block_delta", "index": 0, "delta": {"type": "text_delta", "text": " you."}}, + { + "type": "content_block_start", + "index": 0, + "content_block": {"type": "text", "text": ""}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": "To"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " answer"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " your question about the weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " in Boston and Los"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " Angeles today, I'll"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " need to"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " use"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " the"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " get_current_weather"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " function"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " both"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " cities"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": ". 
Let"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " me fetch"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " that"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " information"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " for"}, + }, + { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": " you."}, + }, {"type": "content_block_stop", "index": 0}, - {"type": "content_block_start", "index": 1, - "content_block": {"type": "tool_use", "id": "toolu_12345", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "{\"locat"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ion\": \"Bos"}}, - {"type": "content_block_delta", "index": 1, "delta": {"type": "input_json_delta", "partial_json": "ton, MA\"}"}}, + { + "type": "content_block_start", + "index": 1, + "content_block": { + "type": "tool_use", + "id": "toolu_12345", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": '{"locat'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ion": "Bos'}, + }, + { + "type": "content_block_delta", + "index": 1, + "delta": {"type": "input_json_delta", "partial_json": 'ton, MA"}'}, + }, {"type": "content_block_stop", "index": 1}, - {"type": "content_block_start", "index": 2, - "content_block": {"type": "tool_use", "id": "toolu_023423423", "name": "get_current_weather", "input": {}}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": ""}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "{\"l"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "oca"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "tio"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "n\": \"Lo"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "s Angel"}}, - {"type": "content_block_delta", "index": 2, "delta": {"type": "input_json_delta", "partial_json": "es, CA\"}"}}, + { + "type": "content_block_start", + "index": 2, + "content_block": { + "type": "tool_use", + "id": "toolu_023423423", + "name": "get_current_weather", + "input": {}, + }, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": ""}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": '{"l'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "oca"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "tio"}, + }, + { + "type": "content_block_delta", + 
"index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'n": "Lo'}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": "s Angel"}, + }, + { + "type": "content_block_delta", + "index": 2, + "delta": {"type": "input_json_delta", "partial_json": 'es, CA"}'}, + }, {"type": "content_block_stop", "index": 2}, - {"type": "message_delta", "delta": {"stop_reason": "tool_use", "stop_sequence": None}, - "usage": {"output_tokens": 137}}, - {"type": "message_stop"} + { + "type": "message_delta", + "delta": {"stop_reason": "tool_use", "stop_sequence": None}, + "usage": {"output_tokens": 137}, + }, + {"type": "message_stop"}, ] @@ -211,12 +353,12 @@ def test_anthropic_tool_streaming(): correct_tool_index = -1 for chunk in anthropic_chunk_list: parsed_chunk = response_iter.chunk_parser(chunk) - if tool_use := parsed_chunk.get('tool_use'): + if tool_use := parsed_chunk.get("tool_use"): # We only increment when a new block starts - if tool_use.get('id') is not None: + if tool_use.get("id") is not None: correct_tool_index += 1 - assert tool_use['index'] == correct_tool_index + assert tool_use["index"] == correct_tool_index @pytest.mark.asyncio @@ -344,4 +486,4 @@ def test_anthropic_tool_calling_translation(): print(translated_params["messages"]) assert len(translated_params["messages"]) > 0 - assert translated_params["messages"][0]["role"] == "user" \ No newline at end of file + assert translated_params["messages"][0]["role"] == "user" diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index e9326752f..f48a85cad 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -1097,3 +1097,73 @@ def test_completion_cost_azure_common_deployment_name(): print(f"mock_client.call_args: {mock_client.call_args.kwargs}") assert "azure/gpt-4" == mock_client.call_args.kwargs["model"] + + +def test_completion_cost_anthropic_prompt_caching(): + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + + from litellm.utils import Choices, Message, ModelResponse, Usage + + model = "anthropic/claude-3-5-sonnet-20240620" + + ## WRITE TO CACHE ## (MORE EXPENSIVE) + response_1 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=100, + cache_read_input_tokens=0, + ), + ) + + ## READ FROM CACHE ## (LESS EXPENSIVE) + response_2 = ModelResponse( + id="chatcmpl-3f427194-0840-4d08-b571-56bfe38a5424", + choices=[ + Choices( + finish_reason="length", + index=0, + message=Message( + content="Hello! 
I'm doing well, thank you for", + role="assistant", + tool_calls=None, + function_call=None, + ), + ) + ], + created=1725036547, + model="claude-3-5-sonnet-20240620", + object="chat.completion", + system_fingerprint=None, + usage=Usage( + completion_tokens=10, + prompt_tokens=14, + total_tokens=24, + cache_creation_input_tokens=0, + cache_read_input_tokens=100, + ), + ) + + cost_1 = completion_cost(model=model, completion_response=response_1) + cost_2 = completion_cost(model=model, completion_response=response_2) + + assert cost_1 > cost_2 diff --git a/litellm/tests/test_dynamic_rate_limit_handler.py b/litellm/tests/test_dynamic_rate_limit_handler.py index f49a760af..d711de71f 100644 --- a/litellm/tests/test_dynamic_rate_limit_handler.py +++ b/litellm/tests/test_dynamic_rate_limit_handler.py @@ -290,6 +290,7 @@ async def test_base_case(dynamic_rate_limit_handler, mock_response): @pytest.mark.asyncio +@pytest.mark.flaky(retries=3, delay=1) async def test_update_cache( dynamic_rate_limit_handler, mock_response, user_api_key_auth ): diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index e8bc999f2..54e2e5b43 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -75,6 +75,16 @@ def test_bedrock_optional_params_embeddings(): assert len(optional_params) == 0 +def test_google_ai_studio_optional_params_embeddings(): + optional_params = get_optional_params_embeddings( + user="John", + encoding_format=None, + custom_llm_provider="gemini", + drop_params=True, + ) + assert len(optional_params) == 0 + + def test_openai_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 81dc268af..aadbdd22a 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -51,6 +51,8 @@ class ModelInfo(TypedDict, total=False): max_input_tokens: Required[Optional[int]] max_output_tokens: Required[Optional[int]] input_cost_per_token: Required[float] + cache_creation_input_token_cost: Optional[float] + cache_read_input_token_cost: Optional[float] input_cost_per_character: Optional[float] # only for vertex ai models input_cost_per_token_above_128k_tokens: Optional[float] # only for vertex ai models input_cost_per_character_above_128k_tokens: Optional[ @@ -454,6 +456,13 @@ class Choices(OpenAIObject): class Usage(CompletionUsage): + _cache_creation_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. + _cache_read_input_tokens: int = PrivateAttr( + 0 + ) # hidden param for prompt caching. Might change, once openai introduces their equivalent. 
+ def __init__( self, prompt_tokens: Optional[int] = None, @@ -466,9 +475,18 @@ class Usage(CompletionUsage): "completion_tokens": completion_tokens or 0, "total_tokens": total_tokens or 0, } - super().__init__(**data) + if "cache_creation_input_tokens" in params and isinstance( + params["cache_creation_input_tokens"], int + ): + self._cache_creation_input_tokens = params["cache_creation_input_tokens"] + + if "cache_read_input_tokens" in params and isinstance( + params["cache_read_input_tokens"], int + ): + self._cache_read_input_tokens = params["cache_read_input_tokens"] + for k, v in params.items(): setattr(self, k, v) diff --git a/litellm/utils.py b/litellm/utils.py index ec4ac79c0..bb50900d0 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2550,6 +2550,7 @@ def get_optional_params_embeddings( encoding_format=None, dimensions=None, custom_llm_provider="", + drop_params: Optional[bool] = None, additional_drop_params: Optional[bool] = None, **kwargs, ): @@ -2560,6 +2561,7 @@ def get_optional_params_embeddings( for k, v in special_params.items(): passed_params[k] = v + drop_params = passed_params.pop("drop_params", None) additional_drop_params = passed_params.pop("additional_drop_params", None) default_params = {"user": None, "encoding_format": None, "dimensions": None} @@ -2571,11 +2573,16 @@ def get_optional_params_embeddings( for k in non_default_params.keys(): if k not in supported_params: unsupported_params[k] = non_default_params[k] - if unsupported_params and not litellm.drop_params: - raise UnsupportedParamsError( - status_code=500, - message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", - ) + if unsupported_params: + if litellm.drop_params is True or ( + drop_params is not None and drop_params is True + ): + pass + else: + raise UnsupportedParamsError( + status_code=500, + message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. 
To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n", + ) non_default_params = _get_non_default_params( passed_params=passed_params, @@ -2680,7 +2687,9 @@ def get_optional_params_embeddings( and custom_llm_provider not in litellm.openai_compatible_providers ): if len(non_default_params.keys()) > 0: - if litellm.drop_params is True: # drop the unsupported non-default values + if ( + litellm.drop_params is True or drop_params is True + ): # drop the unsupported non-default values keys = list(non_default_params.keys()) for k in keys: non_default_params.pop(k, None) @@ -5335,6 +5344,12 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod max_input_tokens=_model_info.get("max_input_tokens", None), max_output_tokens=_model_info.get("max_output_tokens", None), input_cost_per_token=_input_cost_per_token, + cache_creation_input_token_cost=_model_info.get( + "cache_creation_input_token_cost", None + ), + cache_read_input_token_cost=_model_info.get( + "cache_read_input_token_cost", None + ), input_cost_per_character=_model_info.get( "input_cost_per_character", None ), diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index 5e6d0f2ab..a60743c65 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -1336,6 +1336,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.00000025, "output_cost_per_token": 0.00000125, + "cache_creation_input_token_cost": 0.0000003, + "cache_read_input_token_cost": 0.00000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1349,6 +1351,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000075, + "cache_creation_input_token_cost": 0.00001875, + "cache_read_input_token_cost": 0.0000015, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, @@ -1375,6 +1379,8 @@ "max_output_tokens": 4096, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000015, + "cache_creation_input_token_cost": 0.00000375, + "cache_read_input_token_cost": 0.0000003, "litellm_provider": "anthropic", "mode": "chat", "supports_function_calling": true, From 336022e97c5400f6c38950a540fda08b114161dc Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Sat, 31 Aug 2024 14:34:00 -0700 Subject: [PATCH 4/4] test: skip test on end of life model --- litellm/tests/test_streaming.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index d2ef8aafc..1b8b4e085 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -1545,6 +1545,7 @@ def test_completion_bedrock_claude_stream(): # test_completion_bedrock_claude_stream() +@pytest.mark.skip(reason="model end of life") def test_completion_bedrock_ai21_stream(): try: litellm.set_verbose = False
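Usage notes (illustrative, not part of the patches above):

The `litellm_error_code_metric_total` counter from PATCH 1 (documented in PATCH 2) is labelled by `error_code` and `model`, so a scraped sample from `/metrics` would look roughly like `litellm_error_code_metric_total{error_code="429",model="gpt-3.5-turbo"} 3.0` (hypothetical label values and count).

For the prompt-caching cost tracking in PATCH 3, the sketch below exercises the new `cost_per_token` helper in `litellm/llms/anthropic/cost_calculation.py` directly. It is a minimal sketch only: it assumes the `claude-3-5-sonnet-20240620` cache rates added to `model_prices_and_context_window.json` in this series, and it mirrors the comparison made in `test_completion_cost_anthropic_prompt_caching`.

```python
from litellm.llms.anthropic.cost_calculation import cost_per_token
from litellm.types.utils import Usage

# 14 regular prompt tokens, plus 100 tokens written to the prompt cache
cache_write_usage = Usage(
    prompt_tokens=14,
    completion_tokens=10,
    total_tokens=24,
    cache_creation_input_tokens=100,
    cache_read_input_tokens=0,
)

# Same request shape, but the 100 cached tokens are read back instead
cache_read_usage = Usage(
    prompt_tokens=14,
    completion_tokens=10,
    total_tokens=24,
    cache_creation_input_tokens=0,
    cache_read_input_tokens=100,
)

write_prompt_cost, _ = cost_per_token(
    model="claude-3-5-sonnet-20240620", usage=cache_write_usage
)
read_prompt_cost, _ = cost_per_token(
    model="claude-3-5-sonnet-20240620", usage=cache_read_usage
)

# Cache writes are billed above the base input rate and cache reads well below it,
# so the write-heavy request should cost more.
assert write_prompt_cost > read_prompt_cost
```

The same accounting flows through `litellm.completion_cost()` automatically: `cost_calculator.cost_per_token()` now rebuilds a `Usage` block carrying `cache_creation_input_tokens` / `cache_read_input_tokens` and routes `custom_llm_provider == "anthropic"` to this helper.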