diff --git a/litellm/__init__.py b/litellm/__init__.py index 9997b9a8ac..42a96abf13 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -56,6 +56,9 @@ from litellm.constants import ( bedrock_embedding_models, known_tokenizer_config, BEDROCK_INVOKE_PROVIDERS_LITERAL, + DEFAULT_MAX_TOKENS, + DEFAULT_SOFT_BUDGET, + DEFAULT_ALLOWED_FAILS, ) from litellm.types.guardrails import GuardrailItem from litellm.proxy._types import ( @@ -155,7 +158,7 @@ token: Optional[ str ] = None # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648 telemetry = True -max_tokens = 256 # OpenAI Defaults +max_tokens: int = DEFAULT_MAX_TOKENS # OpenAI Defaults drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False)) modify_params = False retry = True @@ -244,7 +247,7 @@ budget_duration: Optional[ str ] = None # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d"). default_soft_budget: float = ( - 50.0 # by default all litellm proxy keys have a soft budget of 50.0 + DEFAULT_SOFT_BUDGET # by default all litellm proxy keys have a soft budget of 50.0 ) forward_traceparent_to_llm_provider: bool = False diff --git a/litellm/_redis.py b/litellm/_redis.py index b2624d4280..14813c436e 100644 --- a/litellm/_redis.py +++ b/litellm/_redis.py @@ -18,6 +18,7 @@ import redis # type: ignore import redis.asyncio as async_redis # type: ignore from litellm import get_secret, get_secret_str +from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT from ._logging import verbose_logger @@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis: # Set up the Sentinel client sentinel = redis.Sentinel( sentinel_nodes, - socket_timeout=0.1, + socket_timeout=REDIS_SOCKET_TIMEOUT, password=sentinel_password, ) @@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis: # Set up the Sentinel client sentinel = async_redis.Sentinel( sentinel_nodes, - socket_timeout=0.1, + socket_timeout=REDIS_SOCKET_TIMEOUT, password=sentinel_password, ) @@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides): verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs) if "url" in redis_kwargs and redis_kwargs["url"] is not None: return async_redis.BlockingConnectionPool.from_url( - timeout=5, url=redis_kwargs["url"] + timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"] ) connection_class = async_redis.Connection if "ssl" in redis_kwargs: @@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides): redis_kwargs.pop("ssl", None) redis_kwargs["connection_class"] = connection_class redis_kwargs.pop("startup_nodes", None) - return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs) + return async_redis.BlockingConnectionPool( + timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs + ) diff --git a/litellm/budget_manager.py b/litellm/budget_manager.py index e664c4f44f..b25967579e 100644 --- a/litellm/budget_manager.py +++ b/litellm/budget_manager.py @@ -14,6 +14,12 @@ import time from typing import Literal, Optional import litellm +from litellm.constants import ( + DAYS_IN_A_MONTH, + DAYS_IN_A_WEEK, + DAYS_IN_A_YEAR, + HOURS_IN_A_DAY, +) from litellm.utils import ModelResponse @@ -81,11 +87,11 @@ class BudgetManager: if duration == "daily": duration_in_days = 1 elif duration == "weekly": - duration_in_days = 7 + duration_in_days = DAYS_IN_A_WEEK elif duration == "monthly": - duration_in_days = 28 + 
duration_in_days = DAYS_IN_A_MONTH elif duration == "yearly": - duration_in_days = 365 + duration_in_days = DAYS_IN_A_YEAR else: raise ValueError( """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]""" @@ -182,7 +188,9 @@ class BudgetManager: current_time = time.time() # Convert duration from days to seconds - duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60 + duration_in_seconds = ( + self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60 + ) # Check if duration has elapsed if current_time - last_updated_at >= duration_in_seconds: diff --git a/litellm/caching/caching.py b/litellm/caching/caching.py index affb8e3855..6a7c93e3fe 100644 --- a/litellm/caching/caching.py +++ b/litellm/caching/caching.py @@ -19,6 +19,7 @@ from pydantic import BaseModel import litellm from litellm._logging import verbose_logger +from litellm.constants import CACHED_STREAMING_CHUNK_DELAY from litellm.litellm_core_utils.model_param_helper import ModelParamHelper from litellm.types.caching import * from litellm.types.utils import all_litellm_params @@ -406,7 +407,7 @@ class Cache: } ] } - time.sleep(0.02) + time.sleep(CACHED_STREAMING_CHUNK_DELAY) def _get_cache_logic( self, diff --git a/litellm/caching/in_memory_cache.py b/litellm/caching/in_memory_cache.py index 5e09fe845f..e3d757d08d 100644 --- a/litellm/caching/in_memory_cache.py +++ b/litellm/caching/in_memory_cache.py @@ -15,7 +15,8 @@ from typing import Any, List, Optional from pydantic import BaseModel -from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB +from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB + from .base_cache import BaseCache @@ -52,7 +53,8 @@ class InMemoryCache(BaseCache): # Fast path for common primitive types that are typically small if ( isinstance(value, (bool, int, float, str)) - and len(str(value)) < self.max_size_per_item * 512 + and len(str(value)) + < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB ): # Conservative estimate return True diff --git a/litellm/caching/qdrant_semantic_cache.py b/litellm/caching/qdrant_semantic_cache.py index bdfd3770ae..32d4d8b0fd 100644 --- a/litellm/caching/qdrant_semantic_cache.py +++ b/litellm/caching/qdrant_semantic_cache.py @@ -11,10 +11,12 @@ Has 4 methods: import ast import asyncio import json -from typing import Any +from typing import Any, cast import litellm from litellm._logging import print_verbose +from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE +from litellm.types.utils import EmbeddingResponse from .base_cache import BaseCache @@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache): } elif quantization_config == "scalar": quantization_params = { - "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False} + "scalar": { + "type": "int8", + "quantile": QDRANT_SCALAR_QUANTILE, + "always_ram": False, + } } elif quantization_config == "product": quantization_params = { @@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache): new_collection_status = self.sync_client.put( url=f"{self.qdrant_api_base}/collections/{self.collection_name}", json={ - "vectors": {"size": 1536, "distance": "Cosine"}, + "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"}, "quantization_config": quantization_params, }, headers=self.headers, @@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache): prompt += message["content"] # create an embedding for prompt - embedding_response = litellm.embedding( - model=self.embedding_model, - input=prompt, - cache={"no-store": True, 
"no-cache": True}, + embedding_response = cast( + EmbeddingResponse, + litellm.embedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ), ) # get the embedding @@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache): prompt += message["content"] # convert to embedding - embedding_response = litellm.embedding( - model=self.embedding_model, - input=prompt, - cache={"no-store": True, "no-cache": True}, + embedding_response = cast( + EmbeddingResponse, + litellm.embedding( + model=self.embedding_model, + input=prompt, + cache={"no-store": True, "no-cache": True}, + ), ) # get the embedding diff --git a/litellm/constants.py b/litellm/constants.py index cace674f2f..a2fd373a61 100644 --- a/litellm/constants.py +++ b/litellm/constants.py @@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = ( 0.5 # default cooldown a deployment if 50% of requests fail in a given minute ) DEFAULT_MAX_TOKENS = 4096 +DEFAULT_ALLOWED_FAILS = 3 DEFAULT_REDIS_SYNC_INTERVAL = 1 DEFAULT_COOLDOWN_TIME_SECONDS = 5 DEFAULT_REPLICATE_POLLING_RETRIES = 5 @@ -16,16 +17,71 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1 DEFAULT_IMAGE_TOKEN_COUNT = 250 DEFAULT_IMAGE_WIDTH = 300 DEFAULT_IMAGE_HEIGHT = 300 +DEFAULT_MAX_TOKENS = 256 # used when providers need a default MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024 # 1MB = 1024KB SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000 # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic. REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer" REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer" MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100 +MINIMUM_PROMPT_CACHE_TOKEN_COUNT = ( + 1024 # minimum number of tokens to cache a prompt by Anthropic +) +DEFAULT_TRIM_RATIO = 0.75 # default ratio of tokens to trim from the end of a prompt +HOURS_IN_A_DAY = 24 +DAYS_IN_A_WEEK = 7 +DAYS_IN_A_MONTH = 28 +DAYS_IN_A_YEAR = 365 +REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64 +#### TOKEN COUNTING #### +FUNCTION_DEFINITION_TOKEN_COUNT = 9 +SYSTEM_MESSAGE_TOKEN_COUNT = 4 +TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4 +DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10 +DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20 +MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768 +MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000 +MAX_TILE_WIDTH = 512 +MAX_TILE_HEIGHT = 512 +OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000 +MIN_NON_ZERO_TEMPERATURE = 0.0001 #### RELIABILITY #### REPEATED_STREAMING_CHUNK_LIMIT = 100 # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives. +DEFAULT_MAX_LRU_CACHE_SIZE = 16 +INITIAL_RETRY_DELAY = 0.5 +MAX_RETRY_DELAY = 8.0 +JITTER = 0.75 +DEFAULT_IN_MEMORY_TTL = 5 # default time to live for the in-memory cache +DEFAULT_POLLING_INTERVAL = 0.03 # default polling interval for the scheduler +AZURE_OPERATION_POLLING_TIMEOUT = 120 +REDIS_SOCKET_TIMEOUT = 0.1 +REDIS_CONNECTION_POOL_TIMEOUT = 5 +NON_LLM_CONNECTION_TIMEOUT = 15 # timeout for adjacent services (e.g. 
jwt auth) +MAX_EXCEPTION_MESSAGE_LENGTH = 2000 +BEDROCK_MAX_POLICY_SIZE = 75 +REPLICATE_POLLING_DELAY_SECONDS = 0.5 +DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096 +TOGETHER_AI_4_B = 4 +TOGETHER_AI_8_B = 8 +TOGETHER_AI_21_B = 21 +TOGETHER_AI_41_B = 41 +TOGETHER_AI_80_B = 80 +TOGETHER_AI_110_B = 110 +TOGETHER_AI_EMBEDDING_150_M = 150 +TOGETHER_AI_EMBEDDING_350_M = 350 +QDRANT_SCALAR_QUANTILE = 0.99 +QDRANT_VECTOR_SIZE = 1536 +CACHED_STREAMING_CHUNK_DELAY = 0.02 +MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512 +DEFAULT_MAX_TOKENS_FOR_TRITON = 2000 #### Networking settings #### request_timeout: float = 6000 # time in seconds STREAM_SSE_DONE_STRING: str = "[DONE]" +### SPEND TRACKING ### +DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400 # price per second for a100 80GB +FIREWORKS_AI_56_B_MOE = 56 +FIREWORKS_AI_176_B_MOE = 176 +FIREWORKS_AI_16_B = 16 +FIREWORKS_AI_80_B = 80 LITELLM_CHAT_PROVIDERS = [ "openai", @@ -426,6 +482,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool" MAX_SPENDLOG_ROWS_TO_QUERY = ( 1_000_000 # if spendLogs has more than 1M rows, do not query the DB ) +DEFAULT_SOFT_BUDGET = ( + 50.0 # by default all litellm proxy keys have a soft budget of 50.0 +) # makes it clear this is a rate limit error for a litellm virtual key RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash" @@ -451,3 +510,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id" ########################### DB CRON JOB NAMES ########################### DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job" DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60 # 1 minute +PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597 +PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605 +PROXY_BATCH_WRITE_AT = 10 # in seconds +DEFAULT_HEALTH_CHECK_INTERVAL = 300 # 5 minutes +PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9 +DEFAULT_MODEL_CREATED_AT_TIME = 1677610602 # returns on `/models` endpoint +DEFAULT_SLACK_ALERTING_THRESHOLD = 300 +MAX_TEAM_LIST_LIMIT = 20 +DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7 +LENGTH_OF_LITELLM_GENERATED_KEY = 16 +SECRET_MANAGER_REFRESH_INTERVAL = 86400 diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index de12698658..98c73a4ce7 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -9,6 +9,10 @@ from pydantic import BaseModel import litellm import litellm._logging from litellm import verbose_logger +from litellm.constants import ( + DEFAULT_MAX_LRU_CACHE_SIZE, + DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND, +) from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import ( StandardBuiltInToolCostTracking, ) @@ -355,9 +359,7 @@ def cost_per_token( # noqa: PLR0915 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0): # see https://replicate.com/pricing # for all litellm currently supported LLMs, almost all requests go to a100_80gb - a100_80gb_price_per_second_public = ( - 0.001400 # assume all calls sent to A100 80GB for now - ) + a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND # assume all calls sent to A100 80GB for now if total_time == 0.0: # total time is in ms start_time = completion_response.get("created", time.time()) end_time = getattr(completion_response, "ended", time.time()) @@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc( return return_model -@lru_cache(maxsize=16) +@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _model_contains_known_llm_provider(model: str) -> bool: """ Check if the model contains a known llm provider diff --git a/litellm/integrations/SlackAlerting/slack_alerting.py 
b/litellm/integrations/SlackAlerting/slack_alerting.py index 50f0538cfd..9fde042ae7 100644 --- a/litellm/integrations/SlackAlerting/slack_alerting.py +++ b/litellm/integrations/SlackAlerting/slack_alerting.py @@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging import litellm.types from litellm._logging import verbose_logger, verbose_proxy_logger from litellm.caching.caching import DualCache +from litellm.constants import HOURS_IN_A_DAY from litellm.integrations.custom_batch_logger import CustomBatchLogger from litellm.litellm_core_utils.duration_parser import duration_in_seconds from litellm.litellm_core_utils.exception_mapping_utils import ( @@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger): event_message += ( f"Budget Crossed\n Total Budget:`{user_info.max_budget}`" ) - elif percent_left <= 0.05: + elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT: event = "threshold_crossed" event_message += "5% Threshold Crossed " - elif percent_left <= 0.15: + elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT: event = "threshold_crossed" event_message += "15% Threshold Crossed" elif user_info.soft_budget is not None: @@ -1718,7 +1719,7 @@ Model Info: await self.internal_usage_cache.async_set_cache( key=_event_cache_key, value="SENT", - ttl=(30 * 24 * 60 * 60), # 1 month + ttl=(30 * HOURS_IN_A_DAY * 60 * 60), # 1 month ) except Exception as e: diff --git a/litellm/integrations/datadog/datadog.py b/litellm/integrations/datadog/datadog.py index e9b6b6b164..fb6fee6dc6 100644 --- a/litellm/integrations/datadog/datadog.py +++ b/litellm/integrations/datadog/datadog.py @@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload from ..additional_logging_utils import AdditionalLoggingUtils # max number of logs DD API can accept -DD_MAX_BATCH_SIZE = 1000 + # specify what ServiceTypes are logged as success events to DD. 
(We don't want to spam DD traces with large number of service types) DD_LOGGED_SUCCESS_SERVICE_TYPES = [ diff --git a/litellm/integrations/gcs_bucket/gcs_bucket.py b/litellm/integrations/gcs_bucket/gcs_bucket.py index 187ab779c0..fc98b0948f 100644 --- a/litellm/integrations/gcs_bucket/gcs_bucket.py +++ b/litellm/integrations/gcs_bucket/gcs_bucket.py @@ -20,10 +20,6 @@ else: VertexBase = Any -GCS_DEFAULT_BATCH_SIZE = 2048 -GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20 - - class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils): def __init__(self, bucket_name: Optional[str] = None) -> None: from litellm.proxy.proxy_server import premium_user diff --git a/litellm/litellm_core_utils/get_llm_provider_logic.py b/litellm/litellm_core_utils/get_llm_provider_logic.py index 037351d0e6..13103c85a0 100644 --- a/litellm/litellm_core_utils/get_llm_provider_logic.py +++ b/litellm/litellm_core_utils/get_llm_provider_logic.py @@ -3,6 +3,7 @@ from typing import Optional, Tuple import httpx import litellm +from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH from litellm.secret_managers.main import get_secret, get_secret_str from ..types.router import LiteLLM_Params @@ -256,10 +257,13 @@ def get_llm_provider( # noqa: PLR0915 elif model in litellm.cohere_chat_models: custom_llm_provider = "cohere_chat" ## replicate - elif model in litellm.replicate_models or (":" in model and len(model) > 64): + elif model in litellm.replicate_models or ( + ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH + ): model_parts = model.split(":") if ( - len(model_parts) > 1 and len(model_parts[1]) == 64 + len(model_parts) > 1 + and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH ): ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3" custom_llm_provider = "replicate" elif model in litellm.replicate_models: diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 84825535c9..255cce7336 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger from litellm.batches.batch_utils import _handle_completed_batch from litellm.caching.caching import DualCache, InMemoryCache from litellm.caching.caching_handler import LLMCachingHandler +from litellm.constants import ( + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT, +) from litellm.cost_calculator import _select_model_name_for_cost_calc from litellm.integrations.arize.arize import ArizeLogger from litellm.integrations.custom_guardrail import CustomGuardrail @@ -3745,9 +3749,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload: response_cost=response_cost, response_cost_failure_debug_info=None, status=str("success"), - total_tokens=int(30), - prompt_tokens=int(20), - completion_tokens=int(10), + total_tokens=int( + DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT + + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT + ), + prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT), + completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT), startTime=start_time, endTime=end_time, completionStartTime=completion_start_time, diff --git a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py index 74d15e9a01..34c370ffca 100644 --- 
a/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py +++ b/litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py @@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools. from typing import Any, Dict, List, Optional import litellm +from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS from litellm.types.llms.openai import FileSearchTool, WebSearchOptions from litellm.types.utils import ( ModelInfo, @@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking: """ if file_search is None: return 0.0 - return 2.5 / 1000 + return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS @staticmethod def chat_completion_response_includes_annotations( diff --git a/litellm/litellm_core_utils/token_counter.py b/litellm/litellm_core_utils/token_counter.py index e6bc65ccff..afd5ab5ff4 100644 --- a/litellm/litellm_core_utils/token_counter.py +++ b/litellm/litellm_core_utils/token_counter.py @@ -11,6 +11,10 @@ from litellm.constants import ( DEFAULT_IMAGE_HEIGHT, DEFAULT_IMAGE_TOKEN_COUNT, DEFAULT_IMAGE_WIDTH, + MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES, + MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES, + MAX_TILE_HEIGHT, + MAX_TILE_WIDTH, ) from litellm.llms.custom_httpx.http_handler import _get_httpx_client @@ -97,11 +101,14 @@ def resize_image_high_res( height: int, ) -> Tuple[int, int]: # Maximum dimensions for high res mode - max_short_side = 768 - max_long_side = 2000 + max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES + max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES # Return early if no resizing is needed - if width <= 768 and height <= 768: + if ( + width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES + and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES + ): return width, height # Determine the longer and shorter sides @@ -132,7 +139,10 @@ def resize_image_high_res( # Test the function with the given example def calculate_tiles_needed( - resized_width, resized_height, tile_width=512, tile_height=512 + resized_width, + resized_height, + tile_width=MAX_TILE_WIDTH, + tile_height=MAX_TILE_HEIGHT, ): tiles_across = (resized_width + tile_width - 1) // tile_width tiles_down = (resized_height + tile_height - 1) // tile_height diff --git a/litellm/llms/anthropic/chat/transformation.py b/litellm/llms/anthropic/chat/transformation.py index 09096c89e7..64702b4f26 100644 --- a/litellm/llms/anthropic/chat/transformation.py +++ b/litellm/llms/anthropic/chat/transformation.py @@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast import httpx import litellm -from litellm.constants import RESPONSE_FORMAT_TOOL_NAME +from litellm.constants import ( + DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, + RESPONSE_FORMAT_TOOL_NAME, +) from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt from litellm.llms.base_llm.base_utils import type_to_response_format_param @@ -53,7 +56,7 @@ class AnthropicConfig(BaseConfig): max_tokens: Optional[ int - ] = 4096 # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) + ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default) stop_sequences: Optional[list] = None temperature: Optional[int] = None top_p: Optional[int] = None @@ -65,7 +68,7 @@ class AnthropicConfig(BaseConfig): self, max_tokens: Optional[ int - ] = 4096, # You can pass in a value yourself or use the default value 4096 + ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS, # You can pass in a value 
yourself or use the default value 4096 stop_sequences: Optional[list] = None, temperature: Optional[int] = None, top_p: Optional[int] = None, diff --git a/litellm/llms/anthropic/completion/transformation.py b/litellm/llms/anthropic/completion/transformation.py index 5cbc0b5fd8..e4e04df4d6 100644 --- a/litellm/llms/anthropic/completion/transformation.py +++ b/litellm/llms/anthropic/completion/transformation.py @@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union import httpx import litellm +from litellm.constants import DEFAULT_MAX_TOKENS from litellm.litellm_core_utils.prompt_templates.factory import ( custom_prompt, prompt_factory, @@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig): def __init__( self, - max_tokens_to_sample: Optional[int] = 256, # anthropic requires a default + max_tokens_to_sample: Optional[ + int + ] = DEFAULT_MAX_TOKENS, # anthropic requires a default stop_sequences: Optional[list] = None, temperature: Optional[int] = None, top_p: Optional[int] = None, diff --git a/litellm/llms/azure/azure.py b/litellm/llms/azure/azure.py index aed813fdab..bb60680ebc 100644 --- a/litellm/llms/azure/azure.py +++ b/litellm/llms/azure/azure.py @@ -7,7 +7,7 @@ import httpx # type: ignore from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI import litellm -from litellm.constants import DEFAULT_MAX_RETRIES +from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.litellm_core_utils.logging_utils import track_llm_api_timing from litellm.llms.custom_httpx.http_handler import ( @@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM): await response.aread() - timeout_secs: int = 120 + timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT start_time = time.time() if "status" not in response.json(): raise Exception( @@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM): response.read() - timeout_secs: int = 120 + timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT start_time = time.time() if "status" not in response.json(): raise Exception( diff --git a/litellm/llms/azure/chat/gpt_transformation.py b/litellm/llms/azure/chat/gpt_transformation.py index ee85517e66..e30d68f97d 100644 --- a/litellm/llms/azure/chat/gpt_transformation.py +++ b/litellm/llms/azure/chat/gpt_transformation.py @@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import ( convert_to_azure_openai_messages, ) from litellm.llms.base_llm.chat.transformation import BaseLLMException +from litellm.types.llms.azure import ( + API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT, + API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT, +) from litellm.types.utils import ModelResponse from litellm.utils import supports_response_schema @@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig): - check if api_version is supported for response_format """ - is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8 + is_supported = ( + int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT + and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT + ) return is_supported diff --git a/litellm/llms/bedrock/base_aws_llm.py b/litellm/llms/bedrock/base_aws_llm.py index 5482d80687..133ef6a952 100644 --- a/litellm/llms/bedrock/base_aws_llm.py +++ b/litellm/llms/bedrock/base_aws_llm.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from litellm._logging import verbose_logger from 
litellm.caching.caching import DualCache -from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL +from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE from litellm.litellm_core_utils.dd_tracing import tracer from litellm.secret_managers.main import get_secret @@ -381,7 +381,7 @@ class BaseAWSLLM: "region_name": aws_region_name, } - if sts_response["PackedPolicySize"] > 75: + if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE: verbose_logger.warning( f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}" ) diff --git a/litellm/llms/deepinfra/chat/transformation.py b/litellm/llms/deepinfra/chat/transformation.py index 429759fad1..0d446d39b9 100644 --- a/litellm/llms/deepinfra/chat/transformation.py +++ b/litellm/llms/deepinfra/chat/transformation.py @@ -1,6 +1,7 @@ from typing import Optional, Tuple, Union import litellm +from litellm.constants import MIN_NON_ZERO_TEMPERATURE from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig from litellm.secret_managers.main import get_secret_str @@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig): and value == 0 and model == "mistralai/Mistral-7B-Instruct-v0.1" ): # this model does no support temperature == 0 - value = 0.0001 # close to 0 + value = MIN_NON_ZERO_TEMPERATURE # close to 0 if param == "tool_choice": if ( value != "auto" and value != "none" diff --git a/litellm/llms/fireworks_ai/cost_calculator.py b/litellm/llms/fireworks_ai/cost_calculator.py index f53aba4a47..31414625ab 100644 --- a/litellm/llms/fireworks_ai/cost_calculator.py +++ b/litellm/llms/fireworks_ai/cost_calculator.py @@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models. from typing import Tuple +from litellm.constants import ( + FIREWORKS_AI_16_B, + FIREWORKS_AI_56_B_MOE, + FIREWORKS_AI_80_B, + FIREWORKS_AI_176_B_MOE, +) from litellm.types.utils import Usage from litellm.utils import get_model_info @@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str: moe_match = re.search(r"(\d+)x(\d+)b", model_name) if moe_match: total_billion = int(moe_match.group(1)) * int(moe_match.group(2)) - if total_billion <= 56: + if total_billion <= FIREWORKS_AI_56_B_MOE: return "fireworks-ai-moe-up-to-56b" - elif total_billion <= 176: + elif total_billion <= FIREWORKS_AI_176_B_MOE: return "fireworks-ai-56b-to-176b" # Check for standard models in the form b @@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str: params_billion = float(params_match) # Determine the category based on the number of parameters - if params_billion <= 16.0: + if params_billion <= FIREWORKS_AI_16_B: return "fireworks-ai-up-to-16b" - elif params_billion <= 80.0: + elif params_billion <= FIREWORKS_AI_80_B: return "fireworks-ai-16b-80b" # If no matches, return the original model_name diff --git a/litellm/llms/predibase/chat/transformation.py b/litellm/llms/predibase/chat/transformation.py index f1a2163d24..8ef0eea173 100644 --- a/litellm/llms/predibase/chat/transformation.py +++ b/litellm/llms/predibase/chat/transformation.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union from httpx import Headers, Response +from litellm.constants import DEFAULT_MAX_TOKENS from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException from litellm.types.llms.openai import AllMessageValues from litellm.types.utils import ModelResponse @@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig): 
decoder_input_details: Optional[bool] = None details: bool = True # enables returning logprobs + best of max_new_tokens: int = ( - 256 # openai default - requests hang if max_new_tokens not given + DEFAULT_MAX_TOKENS # openai default - requests hang if max_new_tokens not given ) repetition_penalty: Optional[float] = None return_full_text: Optional[ diff --git a/litellm/llms/replicate/chat/handler.py b/litellm/llms/replicate/chat/handler.py index 7991c61ee3..d954416381 100644 --- a/litellm/llms/replicate/chat/handler.py +++ b/litellm/llms/replicate/chat/handler.py @@ -4,6 +4,7 @@ import time from typing import Callable, List, Union import litellm +from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, HTTPHandler, @@ -28,7 +29,9 @@ def handle_prediction_response_streaming( status = "" while True and (status not in ["succeeded", "failed", "canceled"]): - time.sleep(0.5) # prevent being rate limited by replicate + time.sleep( + REPLICATE_POLLING_DELAY_SECONDS + ) # prevent being rate limited by replicate print_verbose(f"replicate: polling endpoint: {prediction_url}") response = http_client.get(prediction_url, headers=headers) if response.status_code == 200: @@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming( status = "" while True and (status not in ["succeeded", "failed", "canceled"]): - await asyncio.sleep(0.5) # prevent being rate limited by replicate + await asyncio.sleep( + REPLICATE_POLLING_DELAY_SECONDS + ) # prevent being rate limited by replicate print_verbose(f"replicate: polling endpoint: {prediction_url}") response = await http_client.get(prediction_url, headers=headers) if response.status_code == 200: diff --git a/litellm/llms/replicate/chat/transformation.py b/litellm/llms/replicate/chat/transformation.py index d49350dea7..604e6eefe6 100644 --- a/litellm/llms/replicate/chat/transformation.py +++ b/litellm/llms/replicate/chat/transformation.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union import httpx import litellm +from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH from litellm.litellm_core_utils.prompt_templates.common_utils import ( convert_content_list_to_str, ) @@ -221,10 +222,11 @@ class ReplicateConfig(BaseConfig): version_id = self.model_to_version_id(model) request_data: dict = {"input": input_data} - if ":" in version_id and len(version_id) > 64: + if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH: model_parts = version_id.split(":") if ( - len(model_parts) > 1 and len(model_parts[1]) == 64 + len(model_parts) > 1 + and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH ): ## checks if model name has a 64 digit code - e.g. 
"meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3" request_data["version"] = model_parts[1] diff --git a/litellm/llms/together_ai/cost_calculator.py b/litellm/llms/together_ai/cost_calculator.py index d3b0db8b89..a1be097bc8 100644 --- a/litellm/llms/together_ai/cost_calculator.py +++ b/litellm/llms/together_ai/cost_calculator.py @@ -4,6 +4,16 @@ Handles calculating cost for together ai models import re +from litellm.constants import ( + TOGETHER_AI_4_B, + TOGETHER_AI_8_B, + TOGETHER_AI_21_B, + TOGETHER_AI_41_B, + TOGETHER_AI_80_B, + TOGETHER_AI_110_B, + TOGETHER_AI_EMBEDDING_150_M, + TOGETHER_AI_EMBEDDING_350_M, +) from litellm.types.utils import CallTypes @@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str: else: return model_name # Determine the category based on the number of parameters - if params_billion <= 4.0: + if params_billion <= TOGETHER_AI_4_B: category = "together-ai-up-to-4b" - elif params_billion <= 8.0: + elif params_billion <= TOGETHER_AI_8_B: category = "together-ai-4.1b-8b" - elif params_billion <= 21.0: + elif params_billion <= TOGETHER_AI_21_B: category = "together-ai-8.1b-21b" - elif params_billion <= 41.0: + elif params_billion <= TOGETHER_AI_41_B: category = "together-ai-21.1b-41b" - elif params_billion <= 80.0: + elif params_billion <= TOGETHER_AI_80_B: category = "together-ai-41.1b-80b" - elif params_billion <= 110.0: + elif params_billion <= TOGETHER_AI_110_B: category = "together-ai-81.1b-110b" if category is not None: return category @@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str: else: return model_name # Determine the category based on the number of parameters - if params_million <= 150: + if params_million <= TOGETHER_AI_EMBEDDING_150_M: category = "together-ai-embedding-up-to-150m" - elif params_million <= 350: + elif params_million <= TOGETHER_AI_EMBEDDING_350_M: category = "together-ai-embedding-151m-to-350m" if category is not None: return category diff --git a/litellm/llms/triton/completion/transformation.py b/litellm/llms/triton/completion/transformation.py index db0add6f35..49126917f2 100644 --- a/litellm/llms/triton/completion/transformation.py +++ b/litellm/llms/triton/completion/transformation.py @@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional, from httpx import Headers, Response +from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator from litellm.llms.base_llm.chat.transformation import ( @@ -196,7 +197,9 @@ class TritonGenerateConfig(TritonConfig): data_for_triton: Dict[str, Any] = { "text_input": prompt_factory(model=model, messages=messages), "parameters": { - "max_tokens": int(optional_params.get("max_tokens", 2000)), + "max_tokens": int( + optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON) + ), "bad_words": [""], "stop_words": [""], }, diff --git a/litellm/main.py b/litellm/main.py index 56b0aa3671..5d058c0c44 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -51,6 +51,10 @@ from litellm import ( # type: ignore get_litellm_params, get_optional_params, ) +from litellm.constants import ( + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT, +) from litellm.exceptions import LiteLLMUnknownProvider from litellm.integrations.custom_logger import CustomLogger from 
litellm.litellm_core_utils.audio_utils.utils import get_audio_file_for_health_check @@ -740,7 +744,12 @@ def mock_completion( setattr( model_response, "usage", - Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30), + Usage( + prompt_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT, + completion_tokens=DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + total_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT + + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT, + ), ) try: @@ -3067,7 +3076,7 @@ def completion( # type: ignore # noqa: PLR0915 "max_tokens": max_tokens, "temperature": temperature, "top_p": top_p, - "top_k": kwargs.get("top_k", 40), + "top_k": kwargs.get("top_k"), }, }, ) diff --git a/litellm/proxy/auth/auth_checks.py b/litellm/proxy/auth/auth_checks.py index ddd1008bd0..1e0c8a4609 100644 --- a/litellm/proxy/auth/auth_checks.py +++ b/litellm/proxy/auth/auth_checks.py @@ -20,6 +20,7 @@ import litellm from litellm._logging import verbose_proxy_logger from litellm.caching.caching import DualCache from litellm.caching.dual_cache import LimitedSizeOrderedDict +from litellm.constants import DEFAULT_IN_MEMORY_TTL from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider from litellm.proxy._types import ( RBAC_ROLES, @@ -55,7 +56,7 @@ else: last_db_access_time = LimitedSizeOrderedDict(max_size=100) -db_cache_expiry = 5 # refresh every 5s +db_cache_expiry = DEFAULT_IN_MEMORY_TTL # refresh every 5s all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value diff --git a/litellm/proxy/auth/litellm_license.py b/litellm/proxy/auth/litellm_license.py index d962aad2c0..936f372181 100644 --- a/litellm/proxy/auth/litellm_license.py +++ b/litellm/proxy/auth/litellm_license.py @@ -9,6 +9,7 @@ from typing import Optional import httpx from litellm._logging import verbose_proxy_logger +from litellm.constants import NON_LLM_CONNECTION_TIMEOUT from litellm.llms.custom_httpx.http_handler import HTTPHandler @@ -23,7 +24,7 @@ class LicenseCheck: def __init__(self) -> None: self.license_str = os.getenv("LITELLM_LICENSE", None) verbose_proxy_logger.debug("License Str value - {}".format(self.license_str)) - self.http_handler = HTTPHandler(timeout=15) + self.http_handler = HTTPHandler(timeout=NON_LLM_CONNECTION_TIMEOUT) self.public_key = None self.read_public_key() diff --git a/litellm/proxy/hooks/prompt_injection_detection.py b/litellm/proxy/hooks/prompt_injection_detection.py index b8fa8466a3..ee5d192555 100644 --- a/litellm/proxy/hooks/prompt_injection_detection.py +++ b/litellm/proxy/hooks/prompt_injection_detection.py @@ -15,6 +15,7 @@ from fastapi import HTTPException import litellm from litellm._logging import verbose_proxy_logger from litellm.caching.caching import DualCache +from litellm.constants import DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.prompt_templates.factory import ( prompt_injection_detection_default_pt, @@ -110,7 +111,9 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger): return combinations def check_user_input_similarity( - self, user_input: str, similarity_threshold: float = 0.7 + self, + user_input: str, + similarity_threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD, ) -> bool: user_input_lower = user_input.lower() keywords = self.generate_injection_keywords() diff --git a/litellm/proxy/management_endpoints/key_management_endpoints.py b/litellm/proxy/management_endpoints/key_management_endpoints.py index 
b0bf1fb619..f78ac8744c 100644 --- a/litellm/proxy/management_endpoints/key_management_endpoints.py +++ b/litellm/proxy/management_endpoints/key_management_endpoints.py @@ -24,7 +24,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, s import litellm from litellm._logging import verbose_proxy_logger from litellm.caching import DualCache -from litellm.constants import UI_SESSION_TOKEN_TEAM_ID +from litellm.constants import LENGTH_OF_LITELLM_GENERATED_KEY, UI_SESSION_TOKEN_TEAM_ID from litellm.litellm_core_utils.duration_parser import duration_in_seconds from litellm.proxy._types import * from litellm.proxy.auth.auth_checks import ( @@ -1164,7 +1164,7 @@ async def generate_key_helper_fn( # noqa: PLR0915 if key is not None: token = key else: - token = f"sk-{secrets.token_urlsafe(16)}" + token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}" if duration is None: # allow tokens that never expire expires = None @@ -1745,7 +1745,7 @@ async def regenerate_key_fn( verbose_proxy_logger.debug("key_in_db: %s", _key_in_db) - new_token = f"sk-{secrets.token_urlsafe(16)}" + new_token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}" new_token_hash = hash_token(new_token) new_token_key_name = f"sk-...{new_token[-4:]}" diff --git a/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py b/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py index 7cf3013db0..cba558248d 100644 --- a/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py +++ b/litellm/proxy/pass_through_endpoints/llm_provider_handlers/assembly_passthrough_logging_handler.py @@ -15,6 +15,10 @@ from litellm.litellm_core_utils.litellm_logging import ( ) from litellm.litellm_core_utils.thread_pool_executor import executor from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload +from litellm.types.passthrough_endpoints.assembly_ai import ( + ASSEMBLY_AI_MAX_POLLING_ATTEMPTS, + ASSEMBLY_AI_POLLING_INTERVAL, +) class AssemblyAITranscriptResponse(TypedDict, total=False): @@ -34,13 +38,13 @@ class AssemblyAIPassthroughLoggingHandler: The base URL for the AssemblyAI API """ - self.polling_interval: float = 10 + self.polling_interval: float = ASSEMBLY_AI_POLLING_INTERVAL """ The polling interval for the AssemblyAI API. litellm needs to poll the GET /transcript/{transcript_id} endpoint to get the status of the transcript. """ - self.max_polling_attempts = 180 + self.max_polling_attempts = ASSEMBLY_AI_MAX_POLLING_ATTEMPTS """ The maximum number of polling attempts for the AssemblyAI API. 
""" diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index d265f3bbca..100b0bf6db 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -25,7 +25,10 @@ from typing import ( get_type_hints, ) -from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH +from litellm.constants import ( + DEFAULT_MAX_RECURSE_DEPTH, + DEFAULT_SLACK_ALERTING_THRESHOLD, +) from litellm.types.utils import ( ModelResponse, ModelResponseStream, @@ -118,7 +121,16 @@ import litellm from litellm import Router from litellm._logging import verbose_proxy_logger, verbose_router_logger from litellm.caching.caching import DualCache, RedisCache -from litellm.constants import LITELLM_PROXY_ADMIN_NAME +from litellm.constants import ( + DAYS_IN_A_MONTH, + DEFAULT_HEALTH_CHECK_INTERVAL, + DEFAULT_MODEL_CREATED_AT_TIME, + LITELLM_PROXY_ADMIN_NAME, + PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS, + PROXY_BATCH_WRITE_AT, + PROXY_BUDGET_RESCHEDULER_MAX_TIME, + PROXY_BUDGET_RESCHEDULER_MIN_TIME, +) from litellm.exceptions import RejectedRequestError from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting from litellm.litellm_core_utils.core_helpers import ( @@ -287,7 +299,7 @@ from litellm.router import ( LiteLLM_Params, ModelGroupInfo, ) -from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler +from litellm.scheduler import FlowItem, Scheduler from litellm.secret_managers.aws_secret_manager import load_aws_kms from litellm.secret_managers.google_kms import load_google_kms from litellm.secret_managers.main import ( @@ -307,6 +319,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import DeploymentTypedDict from litellm.types.router import ModelInfo as RouterModelInfo from litellm.types.router import RouterGeneralSettings, updateDeployment +from litellm.types.scheduler import DefaultPriorities from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer from litellm.types.utils import ModelInfo as ModelMapInfo from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload @@ -779,9 +792,9 @@ queue: List = [] litellm_proxy_budget_name = "litellm-proxy-budget" litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME ui_access_mode: Literal["admin", "all"] = "all" -proxy_budget_rescheduler_min_time = 597 -proxy_budget_rescheduler_max_time = 605 -proxy_batch_write_at = 10 # in seconds +proxy_budget_rescheduler_min_time = PROXY_BUDGET_RESCHEDULER_MIN_TIME +proxy_budget_rescheduler_max_time = PROXY_BUDGET_RESCHEDULER_MAX_TIME +proxy_batch_write_at = PROXY_BATCH_WRITE_AT litellm_master_key_hash = None disable_spend_logs = False jwt_handler = JWTHandler() @@ -1846,7 +1859,9 @@ class ProxyConfig: use_background_health_checks = general_settings.get( "background_health_checks", False ) - health_check_interval = general_settings.get("health_check_interval", 300) + health_check_interval = general_settings.get( + "health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL + ) health_check_details = general_settings.get("health_check_details", True) ### RBAC ### @@ -3145,7 +3160,7 @@ class ProxyStartupEvent: scheduler.add_job( proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus, "cron", - hour=9, + hour=PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS, minute=0, timezone=ZoneInfo("America/Los_Angeles"), # Pacific Time ) @@ -3278,7 +3293,7 @@ async def model_list( { "id": model, "object": "model", - "created": 1677610602, + "created": DEFAULT_MODEL_CREATED_AT_TIME, "owned_by": "openai", } for 
model in all_models @@ -5592,7 +5607,7 @@ async def model_metrics( param="None", code=status.HTTP_500_INTERNAL_SERVER_ERROR, ) - startTime = startTime or datetime.now() - timedelta(days=30) + startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH) endTime = endTime or datetime.now() if api_key is None or api_key == "undefined": @@ -5713,11 +5728,12 @@ async def model_metrics_slow_responses( if customer is None or customer == "undefined": customer = "null" - startTime = startTime or datetime.now() - timedelta(days=30) + startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH) endTime = endTime or datetime.now() alerting_threshold = ( - proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300 + proxy_logging_obj.slack_alerting_instance.alerting_threshold + or DEFAULT_SLACK_ALERTING_THRESHOLD ) alerting_threshold = int(alerting_threshold) @@ -5797,7 +5813,7 @@ async def model_metrics_exceptions( code=status.HTTP_500_INTERNAL_SERVER_ERROR, ) - startTime = startTime or datetime.now() - timedelta(days=30) + startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH) endTime = endTime or datetime.now() if api_key is None or api_key == "undefined": diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index eb733e7370..7831d42d81 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -22,6 +22,7 @@ from typing import ( overload, ) +from litellm.constants import MAX_TEAM_LIST_LIMIT from litellm.proxy._types import ( DB_CONNECTION_ERROR_TYPES, CommonProxyErrors, @@ -1596,7 +1597,9 @@ class PrismaClient: where={"team_id": {"in": team_id_list}} ) elif query_type == "find_all" and team_id_list is None: - response = await self.db.litellm_teamtable.find_many(take=20) + response = await self.db.litellm_teamtable.find_many( + take=MAX_TEAM_LIST_LIMIT + ) return response elif table_name == "user_notification": if query_type == "find_unique": diff --git a/litellm/router.py b/litellm/router.py index 78ad2afe1a..b0a04abcaa 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -50,6 +50,7 @@ from litellm.caching.caching import ( RedisCache, RedisClusterCache, ) +from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.asyncify import run_async_function from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs @@ -5073,7 +5074,7 @@ class Router: rpm_usage += t return tpm_usage, rpm_usage - @lru_cache(maxsize=64) + @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _cached_get_model_group_info( self, model_group: str ) -> Optional[ModelGroupInfo]: diff --git a/litellm/router_utils/handle_error.py b/litellm/router_utils/handle_error.py index c331da70ac..ba12e1cbed 100644 --- a/litellm/router_utils/handle_error.py +++ b/litellm/router_utils/handle_error.py @@ -1,6 +1,7 @@ from typing import TYPE_CHECKING, Any, Optional, Union from litellm._logging import verbose_router_logger +from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH from litellm.router_utils.cooldown_handlers import ( _async_get_cooldown_deployments_with_debug_info, ) @@ -54,7 +55,7 @@ async def send_llm_exception_alert( exception_str = str(original_exception) if litellm_debug_info is not None: exception_str += litellm_debug_info - exception_str += f"\n\n{error_traceback_str[:2000]}" + exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}" await litellm_router_instance.slack_alerting_logger.send_alert( message=f"LLM API call 
failed: `{exception_str}`", diff --git a/litellm/scheduler.py b/litellm/scheduler.py index 23346e982a..3225ba0451 100644 --- a/litellm/scheduler.py +++ b/litellm/scheduler.py @@ -6,17 +6,14 @@ from pydantic import BaseModel from litellm import print_verbose from litellm.caching.caching import DualCache, RedisCache +from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL class SchedulerCacheKeys(enum.Enum): queue = "scheduler:queue" - default_in_memory_ttl = 5 # cache queue in-memory for 5s when redis cache available - - -class DefaultPriorities(enum.Enum): - High = 0 - Medium = 128 - Low = 255 + default_in_memory_ttl = ( + DEFAULT_IN_MEMORY_TTL # cache queue in-memory for 5s when redis cache available + ) class FlowItem(BaseModel): @@ -44,7 +41,9 @@ class Scheduler: self.cache = DualCache( redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl ) - self.polling_interval = polling_interval or 0.03 # default to 3ms + self.polling_interval = ( + polling_interval or DEFAULT_POLLING_INTERVAL + ) # default to 3ms async def add_request(self, request: FlowItem): # We use the priority directly, as lower values indicate higher priority diff --git a/litellm/secret_managers/google_secret_manager.py b/litellm/secret_managers/google_secret_manager.py index f21963c38a..2fd35ced6e 100644 --- a/litellm/secret_managers/google_secret_manager.py +++ b/litellm/secret_managers/google_secret_manager.py @@ -5,6 +5,7 @@ from typing import Optional import litellm from litellm._logging import verbose_logger from litellm.caching.caching import InMemoryCache +from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase from litellm.llms.custom_httpx.http_handler import _get_httpx_client from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem @@ -13,7 +14,7 @@ from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem class GoogleSecretManager(GCSBucketBase): def __init__( self, - refresh_interval: Optional[int] = 86400, + refresh_interval: Optional[int] = SECRET_MANAGER_REFRESH_INTERVAL, always_read_secret_manager: Optional[bool] = False, ) -> None: """ diff --git a/litellm/secret_managers/hashicorp_secret_manager.py b/litellm/secret_managers/hashicorp_secret_manager.py index e0b4a08ce8..e5911ffa9b 100644 --- a/litellm/secret_managers/hashicorp_secret_manager.py +++ b/litellm/secret_managers/hashicorp_secret_manager.py @@ -6,6 +6,7 @@ import httpx import litellm from litellm._logging import verbose_logger from litellm.caching import InMemoryCache +from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL from litellm.llms.custom_httpx.http_handler import ( _get_httpx_client, get_async_httpx_client, @@ -39,8 +40,14 @@ class HashicorpSecretManager(BaseSecretManager): litellm.secret_manager_client = self litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT - _refresh_interval = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", 86400) - _refresh_interval = int(_refresh_interval) if _refresh_interval else 86400 + _refresh_interval = os.environ.get( + "HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL + ) + _refresh_interval = ( + int(_refresh_interval) + if _refresh_interval + else SECRET_MANAGER_REFRESH_INTERVAL + ) self.cache = InMemoryCache( default_ttl=_refresh_interval ) # store in memory for 1 day diff --git a/litellm/types/integrations/datadog.py b/litellm/types/integrations/datadog.py index 79d4eded47..7ea25561f9 100644 --- 
a/litellm/types/integrations/datadog.py +++ b/litellm/types/integrations/datadog.py @@ -1,6 +1,8 @@ from enum import Enum from typing import Optional, TypedDict +DD_MAX_BATCH_SIZE = 1000 + class DataDogStatus(str, Enum): INFO = "info" diff --git a/litellm/types/integrations/gcs_bucket.py b/litellm/types/integrations/gcs_bucket.py index a4fd8a6a11..9f5065ced2 100644 --- a/litellm/types/integrations/gcs_bucket.py +++ b/litellm/types/integrations/gcs_bucket.py @@ -8,6 +8,10 @@ else: VertexBase = Any +GCS_DEFAULT_BATCH_SIZE = 2048 +GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20 + + class GCSLoggingConfig(TypedDict): """ Internal LiteLLM Config for GCS Bucket logging diff --git a/litellm/types/integrations/slack_alerting.py b/litellm/types/integrations/slack_alerting.py index 9019b098d9..052fd05ea8 100644 --- a/litellm/types/integrations/slack_alerting.py +++ b/litellm/types/integrations/slack_alerting.py @@ -7,6 +7,9 @@ from pydantic import BaseModel, Field from litellm.types.utils import LiteLLMPydanticObjectBase +SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05 +SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15 + class BaseOutageModel(TypedDict): alerts: List[int] diff --git a/litellm/types/llms/azure.py b/litellm/types/llms/azure.py new file mode 100644 index 0000000000..36c4258abd --- /dev/null +++ b/litellm/types/llms/azure.py @@ -0,0 +1,2 @@ +API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024 +API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8 diff --git a/litellm/types/llms/triton.py b/litellm/types/llms/triton.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/litellm/types/llms/triton.py @@ -0,0 +1 @@ + diff --git a/litellm/types/passthrough_endpoints/assembly_ai.py b/litellm/types/passthrough_endpoints/assembly_ai.py new file mode 100644 index 0000000000..91b7273a48 --- /dev/null +++ b/litellm/types/passthrough_endpoints/assembly_ai.py @@ -0,0 +1,2 @@ +ASSEMBLY_AI_POLLING_INTERVAL = 10 +ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180 diff --git a/litellm/types/scheduler.py b/litellm/types/scheduler.py new file mode 100644 index 0000000000..1b2073f257 --- /dev/null +++ b/litellm/types/scheduler.py @@ -0,0 +1,7 @@ +from enum import Enum + + +class DefaultPriorities(Enum): + High = 0 + Medium = 128 + Low = 255 diff --git a/litellm/utils.py b/litellm/utils.py index 4283cf2df1..cdee0abcd7 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -62,6 +62,16 @@ import litellm.llms.gemini from litellm.caching._internal_lru_cache import lru_cache_wrapper from litellm.caching.caching import DualCache from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler +from litellm.constants import ( + DEFAULT_MAX_LRU_CACHE_SIZE, + DEFAULT_TRIM_RATIO, + FUNCTION_DEFINITION_TOKEN_COUNT, + INITIAL_RETRY_DELAY, + JITTER, + MAX_RETRY_DELAY, + MINIMUM_PROMPT_CACHE_TOKEN_COUNT, + TOOL_CHOICE_OBJECT_TOKEN_COUNT, +) from litellm.integrations.custom_guardrail import CustomGuardrail from litellm.integrations.custom_logger import CustomLogger from litellm.litellm_core_utils.core_helpers import ( @@ -1520,7 +1530,7 @@ def _select_tokenizer( return _select_tokenizer_helper(model=model) -@lru_cache(maxsize=128) +@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE) def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse: if litellm.disable_hf_tokenizer_download is True: return _return_openai_tokenizer(model) @@ -5336,15 +5346,15 @@ def _calculate_retry_after( if retry_after is not None and 0 < retry_after <= 60: return retry_after - initial_retry_delay = 0.5 - max_retry_delay = 8.0 + 
initial_retry_delay = INITIAL_RETRY_DELAY + max_retry_delay = MAX_RETRY_DELAY nb_retries = max_retries - remaining_retries # Apply exponential backoff, but not more than the max. sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay) # Apply some jitter, plus-or-minus half a second. - jitter = 1 - 0.25 * random.random() + jitter = JITTER * random.random() timeout = sleep_seconds * jitter return timeout if timeout >= min_timeout else min_timeout @@ -5670,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]): def trim_messages( messages, model: Optional[str] = None, - trim_ratio: float = 0.75, + trim_ratio: float = DEFAULT_TRIM_RATIO, return_response_tokens: bool = False, max_tokens=None, ): @@ -6543,7 +6553,7 @@ def is_prompt_caching_valid_prompt( model=model, use_default_image_token_count=True, ) - return token_count >= 1024 + return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT except Exception as e: verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}") return False diff --git a/mypy.ini b/mypy.ini index 3ce8c5fcc0..bb0e9ec871 100644 --- a/mypy.ini +++ b/mypy.ini @@ -3,6 +3,7 @@ warn_return_any = False ignore_missing_imports = True mypy_path = litellm/stubs namespace_packages = True +disable_error_code = valid-type [mypy-google.*] ignore_missing_imports = True diff --git a/tests/code_coverage_tests/ban_constant_numbers.py b/tests/code_coverage_tests/ban_constant_numbers.py new file mode 100644 index 0000000000..c23b338086 --- /dev/null +++ b/tests/code_coverage_tests/ban_constant_numbers.py @@ -0,0 +1,152 @@ +import sys +import ast +import os + +# Extremely restrictive set of allowed numbers +ALLOWED_NUMBERS = { + 0, + 1, + -1, + 2, + 10, + 100, + 1000, + 4, + 3, + 500, + 6, + 60, + 3600, + 0.75, + 7, + 1024, + 1011, + 600, + 12, + 1000000000.0, + 0.1, + 50, + 128, + 6000, + 30, + 1000000, + 5, + 15, + 25, + 10000, + 60000, + 8, + 2048, + 16000000000, + 16, + 16383, + 14, + 24, + 128000, + 0.01, + 20, +} + +# Add all standard HTTP status codes +HTTP_STATUS_CODES = { + 200, # OK + 201, # Created + 202, # Accepted + 204, # No Content + 300, # Multiple Choices + 301, # Moved Permanently + 302, # Found + 303, # See Other + 304, # Not Modified + 307, # Temporary Redirect + 308, # Permanent Redirect + 400, # Bad Request + 401, # Unauthorized + 402, # Payment Required + 403, # Forbidden + 404, # Not Found + 406, # Not Acceptable + 408, # Request Timeout + 409, # Conflict + 413, # Payload Too Large + 422, # Unprocessable Entity + 424, # Failed Dependency + 429, # Too Many Requests + 498, # Invalid Token + 499, # Client Closed Request + 500, # Internal Server Error + 501, # Not Implemented + 502, # Bad Gateway + 503, # Service Unavailable + 504, # Gateway Timeout + 520, # Web server is returning an unknown error + 522, # Connection timed out + 524, # A timeout occurred + 529, # Site is overloaded +} + +# Combine the sets +ALLOWED_NUMBERS = ALLOWED_NUMBERS.union(HTTP_STATUS_CODES) + + +class HardcodedNumberFinder(ast.NodeVisitor): + def __init__(self): + self.hardcoded_numbers = [] + + def visit_Constant(self, node): + # For Python 3.8+ + if isinstance(node.value, (int, float)) and node.value not in ALLOWED_NUMBERS: + self.hardcoded_numbers.append((node.lineno, node.value)) + self.generic_visit(node) + + def visit_Num(self, node): + # For older Python versions + if node.n not in ALLOWED_NUMBERS: + self.hardcoded_numbers.append((node.lineno, node.n)) + self.generic_visit(node) + + +def check_file(filename): + try: + 
with open(filename, "r") as f: + content = f.read() + + tree = ast.parse(content) + finder = HardcodedNumberFinder() + finder.visit(tree) + + if finder.hardcoded_numbers: + print(f"ERROR in {filename}: Hardcoded numbers detected:") + for line, value in finder.hardcoded_numbers: + print(f" Line {line}: {value}") + return 1 + return 0 + except SyntaxError: + print(f"Syntax error in {filename}") + return 0 + + +def main(): + exit_code = 0 + folder = "../../litellm" + ignore_files = [ + "constants.py", + "proxy_cli.py", + "token_counter.py", + "mock_functions.py", + "duration_parser.py", + "utils.py", + ] + ignore_folder = "types" + for root, dirs, files in os.walk(folder): + for filename in files: + if filename.endswith(".py") and filename not in ignore_files: + full_path = os.path.join(root, filename) + if ignore_folder in full_path: + continue + exit_code |= check_file(full_path) + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/tests/code_coverage_tests/log.txt b/tests/code_coverage_tests/log.txt new file mode 100644 index 0000000000..e69de29bb2
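Note: the new tests/code_coverage_tests/ban_constant_numbers.py above walks every litellm Python file with an ast.NodeVisitor and flags numeric literals that are not in an allow-list. A minimal standalone sketch of the same AST-based idea, assuming an illustrative allow-list and sample source string (the explicit bool guard is an extra refinement for clarity, not part of the PR's checker):

import ast

# Illustrative allow-list; the PR's ALLOWED_NUMBERS set is much larger.
ALLOWED = {0, 1, -1, 2}


class MagicNumberFinder(ast.NodeVisitor):
    def __init__(self):
        self.hits = []

    def visit_Constant(self, node):
        # Flag int/float literals outside the allow-list.
        # bool is a subclass of int, so True/False are excluded explicitly here.
        if (
            isinstance(node.value, (int, float))
            and not isinstance(node.value, bool)
            and node.value not in ALLOWED
        ):
            self.hits.append((node.lineno, node.value))
        self.generic_visit(node)


source = "timeout = 5\nretries = 1\n"
finder = MagicNumberFinder()
finder.visit(ast.parse(source))
print(finder.hits)  # [(1, 5)] -> the hardcoded 5 should become a named constant

Running this against a file instead of a string only requires reading the file and calling ast.parse on its contents, which is what check_file() above does before OR-ing each per-file result into the process exit code.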