diff --git a/litellm/constants.py b/litellm/constants.py index da66f897c9..d6b5d1e08a 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -18,9 +18,13 @@ MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. #### RELIABILITY #### REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. +DEFAULT_MAX_LRU_CACHE_SIZE = 16 #### Networking settings #### request_timeout: float = 6000 # time in seconds STREAM_SSE_DONE_STRING: str = "[DONE]" +### SPEND TRACKING ### +DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400 # price per second for a100 80GB + LITELLM_CHAT_PROVIDERS = [ "openai", diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index 55736772af..8956e0f40e 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -9,6 +9,10 @@ from pydantic import BaseModel import litellm import litellm._logging from litellm import verbose_logger +from litellm.constants import ( + DEFAULT_MAX_LRU_CACHE_SIZE, + DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND, +) from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import ( StandardBuiltInToolCostTracking, ) @@ -357,9 +361,7 @@ def cost_per_token( # noqa: PLR0915 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0): # see https://replicate.com/pricing # for all litellm currently supported LLMs, almost all requests go to a100_80gb - a100_80gb_price_per_second_public = ( - 0.001400 # assume all calls sent to A100 80GB for now - ) + a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND # assume all calls sent to A100 80GB for now if total_time == 0.0: # total time is in ms start_time = completion_response.get("created", time.time()) end_time = getattr(completion_response, "ended", time.time()) @@ -452,7 +454,7 @@ def _select_model_name_for_cost_calc( return return_model -@lru_cache(maxsize=16) +@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _model_contains_known_llm_provider(model: str) -> bool: """ Check if the model contains a known llm provider diff --git a/tests/code_coverage_tests/ban_constant_numbers.py b/tests/code_coverage_tests/ban_constant_numbers.py index 40e7139ae9..ea31a8b051 100644 --- a/tests/code_coverage_tests/ban_constant_numbers.py +++ b/tests/code_coverage_tests/ban_constant_numbers.py @@ -3,7 +3,7 @@ import ast import os # Extremely restrictive set of allowed numbers -ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100} +ALLOWED_NUMBERS = {0, 1, -1, 2, 10, 100, 1000} class HardcodedNumberFinder(ast.NodeVisitor):