Squashed commit of the following: (#9709)

commit b12a9892b7
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Wed Apr 2 08:09:56 2025 -0700

    fix(utils.py): don't modify openai_token_counter

commit 294de31803
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 21:22:40 2025 -0700

    fix: fix linting error

commit cb6e9fbe40
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:52:45 2025 -0700

    refactor: complete migration

commit bfc159172d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 19:09:59 2025 -0700

    refactor: refactor more constants

commit 43ffb6a558
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:45:24 2025 -0700

    fix: test

commit 04dbe4310c
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:28:58 2025 -0700

    refactor: refactor: move more constants into constants.py

commit 3c26284aff
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:14:46 2025 -0700

    refactor: migrate hardcoded constants out of __init__.py

commit c11e0de69d
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:11:21 2025 -0700

    build: migrate all constants into constants.py

commit 7882bdc787
Author: Krrish Dholakia <krrishdholakia@gmail.com>
Date:   Mon Mar 24 18:07:37 2025 -0700

    build: initial test banning hardcoded numbers in repo
Committed by: Krish Dholakia, 2025-04-02 21:24:54 -07:00 (committed by GitHub)
Commit: 8ee32291e0 (parent 5a722ef18f), GPG key ID: B5690EEEBB952194
51 changed files with 509 additions and 118 deletions
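Note: the first commit in the squash ("build: initial test banning hardcoded numbers in repo") adds a repo-level guard for this migration, but that test file is not among the hunks shown below. As a rough illustration only, a check of that kind can be written as an AST scan over the package; the paths, allowlist, and helper names here are hypothetical and are not the repository's actual test.

# Hypothetical sketch of a "no hardcoded numbers" check (not the repo's actual test).
import ast
import pathlib

ALLOWED_NUMBERS = {0, 1, -1, 2}  # assumption: trivial literals stay allowed
PACKAGE_ROOT = pathlib.Path("litellm")  # assumption: scan the main package only


def find_hardcoded_numbers(path: pathlib.Path) -> list:
    """Return (line, value) pairs for numeric literals outside the allowlist."""
    tree = ast.parse(path.read_text(), filename=str(path))
    violations = []
    for node in ast.walk(tree):
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            if node.value not in ALLOWED_NUMBERS:
                violations.append((node.lineno, node.value))
    return violations


def test_no_hardcoded_numbers():
    # constants.py is the one place where numeric literals are expected to live
    offenders = {}
    for py_file in PACKAGE_ROOT.rglob("*.py"):
        if py_file.name == "constants.py":
            continue
        found = find_hardcoded_numbers(py_file)
        if found:
            offenders[str(py_file)] = found
    assert not offenders, f"Hardcoded numbers found outside constants.py: {offenders}"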

View file

@@ -56,6 +56,9 @@ from litellm.constants import (
     bedrock_embedding_models,
     known_tokenizer_config,
     BEDROCK_INVOKE_PROVIDERS_LITERAL,
+    DEFAULT_MAX_TOKENS,
+    DEFAULT_SOFT_BUDGET,
+    DEFAULT_ALLOWED_FAILS,
 )
 from litellm.types.guardrails import GuardrailItem
 from litellm.proxy._types import (
@@ -155,7 +158,7 @@ token: Optional[
     str
 ] = None  # Not used anymore, will be removed in next MAJOR release - https://github.com/BerriAI/litellm/discussions/648
 telemetry = True
-max_tokens = 256  # OpenAI Defaults
+max_tokens: int = DEFAULT_MAX_TOKENS  # OpenAI Defaults
 drop_params = bool(os.getenv("LITELLM_DROP_PARAMS", False))
 modify_params = False
 retry = True
@@ -244,7 +247,7 @@ budget_duration: Optional[
     str
 ] = None  # proxy only - resets budget after fixed duration. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
 default_soft_budget: float = (
-    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+    DEFAULT_SOFT_BUDGET  # by default all litellm proxy keys have a soft budget of 50.0
 )
 forward_traceparent_to_llm_provider: bool = False

View file

@@ -18,6 +18,7 @@ import redis  # type: ignore
 import redis.asyncio as async_redis  # type: ignore
 from litellm import get_secret, get_secret_str
+from litellm.constants import REDIS_CONNECTION_POOL_TIMEOUT, REDIS_SOCKET_TIMEOUT
 from ._logging import verbose_logger
@@ -215,7 +216,7 @@ def _init_redis_sentinel(redis_kwargs) -> redis.Redis:
     # Set up the Sentinel client
     sentinel = redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
@@ -239,7 +240,7 @@ def _init_async_redis_sentinel(redis_kwargs) -> async_redis.Redis:
     # Set up the Sentinel client
     sentinel = async_redis.Sentinel(
         sentinel_nodes,
-        socket_timeout=0.1,
+        socket_timeout=REDIS_SOCKET_TIMEOUT,
         password=sentinel_password,
     )
@@ -319,7 +320,7 @@ def get_redis_connection_pool(**env_overrides):
     verbose_logger.debug("get_redis_connection_pool: redis_kwargs", redis_kwargs)
     if "url" in redis_kwargs and redis_kwargs["url"] is not None:
         return async_redis.BlockingConnectionPool.from_url(
-            timeout=5, url=redis_kwargs["url"]
+            timeout=REDIS_CONNECTION_POOL_TIMEOUT, url=redis_kwargs["url"]
         )
     connection_class = async_redis.Connection
     if "ssl" in redis_kwargs:
@@ -327,4 +328,6 @@ def get_redis_connection_pool(**env_overrides):
         redis_kwargs.pop("ssl", None)
         redis_kwargs["connection_class"] = connection_class
     redis_kwargs.pop("startup_nodes", None)
-    return async_redis.BlockingConnectionPool(timeout=5, **redis_kwargs)
+    return async_redis.BlockingConnectionPool(
+        timeout=REDIS_CONNECTION_POOL_TIMEOUT, **redis_kwargs
+    )

View file

@@ -14,6 +14,12 @@ import time
 from typing import Literal, Optional
 import litellm
+from litellm.constants import (
+    DAYS_IN_A_MONTH,
+    DAYS_IN_A_WEEK,
+    DAYS_IN_A_YEAR,
+    HOURS_IN_A_DAY,
+)
 from litellm.utils import ModelResponse
@@ -81,11 +87,11 @@ class BudgetManager:
         if duration == "daily":
             duration_in_days = 1
         elif duration == "weekly":
-            duration_in_days = 7
+            duration_in_days = DAYS_IN_A_WEEK
         elif duration == "monthly":
-            duration_in_days = 28
+            duration_in_days = DAYS_IN_A_MONTH
         elif duration == "yearly":
-            duration_in_days = 365
+            duration_in_days = DAYS_IN_A_YEAR
         else:
             raise ValueError(
                 """duration needs to be one of ["daily", "weekly", "monthly", "yearly"]"""
@@ -182,7 +188,9 @@ class BudgetManager:
         current_time = time.time()
         # Convert duration from days to seconds
-        duration_in_seconds = self.user_dict[user]["duration"] * 24 * 60 * 60
+        duration_in_seconds = (
+            self.user_dict[user]["duration"] * HOURS_IN_A_DAY * 60 * 60
+        )
         # Check if duration has elapsed
         if current_time - last_updated_at >= duration_in_seconds:

View file

@@ -19,6 +19,7 @@ from pydantic import BaseModel
 import litellm
 from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
 from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
 from litellm.types.caching import *
 from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ class Cache:
                     }
                 ]
             }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
     def _get_cache_logic(
         self,

View file

@@ -15,7 +15,8 @@ from typing import Any, List, Optional
 from pydantic import BaseModel
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
 from .base_cache import BaseCache
@@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
         # Fast path for common primitive types that are typically small
         if (
             isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
         ):  # Conservative estimate
             return True

View file

@@ -11,10 +11,12 @@ Has 4 methods:
 import ast
 import asyncio
 import json
-from typing import Any
+from typing import Any, cast
 import litellm
 from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
 from .base_cache import BaseCache
@@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
             }
         elif quantization_config == "scalar":
             quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
             }
         elif quantization_config == "product":
             quantization_params = {
@@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
         new_collection_status = self.sync_client.put(
             url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
             json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                 "quantization_config": quantization_params,
             },
             headers=self.headers,
@@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
                 prompt += message["content"]
         # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
-        )
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
+        )
         # get the embedding
@@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
                 prompt += message["content"]
         # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
-        )
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
+        )
         # get the embedding

View file

@@ -9,6 +9,7 @@ DEFAULT_FAILURE_THRESHOLD_PERCENT = (
     0.5  # default cooldown a deployment if 50% of requests fail in a given minute
 )
 DEFAULT_MAX_TOKENS = 4096
+DEFAULT_ALLOWED_FAILS = 3
 DEFAULT_REDIS_SYNC_INTERVAL = 1
 DEFAULT_COOLDOWN_TIME_SECONDS = 5
 DEFAULT_REPLICATE_POLLING_RETRIES = 5
@@ -16,16 +17,71 @@ DEFAULT_REPLICATE_POLLING_DELAY_SECONDS = 1
 DEFAULT_IMAGE_TOKEN_COUNT = 250
 DEFAULT_IMAGE_WIDTH = 300
 DEFAULT_IMAGE_HEIGHT = 300
+DEFAULT_MAX_TOKENS = 256  # used when providers need a default
 MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 1024  # 1MB = 1024KB
 SINGLE_DEPLOYMENT_TRAFFIC_FAILURE_THRESHOLD = 1000  # Minimum number of requests to consider "reasonable traffic". Used for single-deployment cooldown logic.
 REDIS_UPDATE_BUFFER_KEY = "litellm_spend_update_buffer"
 REDIS_DAILY_SPEND_UPDATE_BUFFER_KEY = "litellm_daily_spend_update_buffer"
 MAX_REDIS_BUFFER_DEQUEUE_COUNT = 100
+MINIMUM_PROMPT_CACHE_TOKEN_COUNT = (
+    1024  # minimum number of tokens to cache a prompt by Anthropic
+)
+DEFAULT_TRIM_RATIO = 0.75  # default ratio of tokens to trim from the end of a prompt
+HOURS_IN_A_DAY = 24
+DAYS_IN_A_WEEK = 7
+DAYS_IN_A_MONTH = 28
+DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
+#### TOKEN COUNTING ####
+FUNCTION_DEFINITION_TOKEN_COUNT = 9
+SYSTEM_MESSAGE_TOKEN_COUNT = 4
+TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
+DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
+DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
 #### RELIABILITY ####
 REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
+DEFAULT_MAX_LRU_CACHE_SIZE = 16
+INITIAL_RETRY_DELAY = 0.5
+MAX_RETRY_DELAY = 8.0
+JITTER = 0.75
+DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
+DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
+REDIS_SOCKET_TIMEOUT = 0.1
+REDIS_CONNECTION_POOL_TIMEOUT = 5
+NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
 #### Networking settings ####
 request_timeout: float = 6000  # time in seconds
 STREAM_SSE_DONE_STRING: str = "[DONE]"
+### SPEND TRACKING ###
+DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
 LITELLM_CHAT_PROVIDERS = [
     "openai",
@@ -426,6 +482,9 @@ MCP_TOOL_NAME_PREFIX = "mcp_tool"
 MAX_SPENDLOG_ROWS_TO_QUERY = (
     1_000_000  # if spendLogs has more than 1M rows, do not query the DB
 )
+DEFAULT_SOFT_BUDGET = (
+    50.0  # by default all litellm proxy keys have a soft budget of 50.0
+)
 # makes it clear this is a rate limit error for a litellm virtual key
 RATE_LIMIT_ERROR_MESSAGE_FOR_VIRTUAL_KEY = "LiteLLM Virtual Key user_api_key_hash"
@@ -451,3 +510,14 @@ LITELLM_PROXY_ADMIN_NAME = "default_user_id"
 ########################### DB CRON JOB NAMES ###########################
 DB_SPEND_UPDATE_JOB_NAME = "db_spend_update_job"
 DEFAULT_CRON_JOB_LOCK_TTL_SECONDS = 60  # 1 minute
+PROXY_BUDGET_RESCHEDULER_MIN_TIME = 597
+PROXY_BUDGET_RESCHEDULER_MAX_TIME = 605
+PROXY_BATCH_WRITE_AT = 10  # in seconds
+DEFAULT_HEALTH_CHECK_INTERVAL = 300  # 5 minutes
+PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS = 9
+DEFAULT_MODEL_CREATED_AT_TIME = 1677610602  # returns on `/models` endpoint
+DEFAULT_SLACK_ALERTING_THRESHOLD = 300
+MAX_TEAM_LIST_LIMIT = 20
+DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD = 0.7
+LENGTH_OF_LITELLM_GENERATED_KEY = 16
+SECRET_MANAGER_REFRESH_INTERVAL = 86400

View file

@@ -9,6 +9,10 @@ from pydantic import BaseModel
 import litellm
 import litellm._logging
 from litellm import verbose_logger
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND,
+)
 from litellm.litellm_core_utils.llm_cost_calc.tool_call_cost_tracking import (
     StandardBuiltInToolCostTracking,
 )
@@ -355,9 +359,7 @@ def cost_per_token(  # noqa: PLR0915
 def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
     # see https://replicate.com/pricing
     # for all litellm currently supported LLMs, almost all requests go to a100_80gb
-    a100_80gb_price_per_second_public = (
-        0.001400  # assume all calls sent to A100 80GB for now
-    )
+    a100_80gb_price_per_second_public = DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND  # assume all calls sent to A100 80GB for now
     if total_time == 0.0:  # total time is in ms
         start_time = completion_response.get("created", time.time())
         end_time = getattr(completion_response, "ended", time.time())
@@ -450,7 +452,7 @@ def _select_model_name_for_cost_calc(
     return return_model
-@lru_cache(maxsize=16)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _model_contains_known_llm_provider(model: str) -> bool:
     """
     Check if the model contains a known llm provider

View file

@@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
 import litellm.types
 from litellm._logging import verbose_logger, verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
 from litellm.integrations.custom_batch_logger import CustomBatchLogger
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -649,10 +650,10 @@ class SlackAlerting(CustomBatchLogger):
                 event_message += (
                     f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
                 )
-            elif percent_left <= 0.05:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
                 event = "threshold_crossed"
                 event_message += "5% Threshold Crossed "
-            elif percent_left <= 0.15:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
                 event = "threshold_crossed"
                 event_message += "15% Threshold Crossed"
         elif user_info.soft_budget is not None:
@@ -1718,7 +1719,7 @@ Model Info:
                 await self.internal_usage_cache.async_set_cache(
                     key=_event_cache_key,
                     value="SENT",
-                    ttl=(30 * 24 * 60 * 60),  # 1 month
+                    ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
                 )
             except Exception as e:

View file

@@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
 from ..additional_logging_utils import AdditionalLoggingUtils
 # max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
 # specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
 DD_LOGGED_SUCCESS_SERVICE_TYPES = [

View file

@@ -20,10 +20,6 @@ else:
     VertexBase = Any
-GCS_DEFAULT_BATCH_SIZE = 2048
-GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
 class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
     def __init__(self, bucket_name: Optional[str] = None) -> None:
         from litellm.proxy.proxy_server import premium_user

View file

@@ -3,6 +3,7 @@ from typing import Optional, Tuple
 import httpx
 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.secret_managers.main import get_secret, get_secret_str
 from ..types.router import LiteLLM_Params
@@ -256,10 +257,13 @@ def get_llm_provider(  # noqa: PLR0915
         elif model in litellm.cohere_chat_models:
             custom_llm_provider = "cohere_chat"
         ## replicate
-        elif model in litellm.replicate_models or (":" in model and len(model) > 64):
+        elif model in litellm.replicate_models or (
+            ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
+        ):
             model_parts = model.split(":")
             if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
             ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                 custom_llm_provider = "replicate"
             elif model in litellm.replicate_models:

View file

@@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
 from litellm.batches.batch_utils import _handle_completed_batch
 from litellm.caching.caching import DualCache, InMemoryCache
 from litellm.caching.caching_handler import LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.cost_calculator import _select_model_name_for_cost_calc
 from litellm.integrations.arize.arize import ArizeLogger
 from litellm.integrations.custom_guardrail import CustomGuardrail
@@ -3745,9 +3749,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
         response_cost=response_cost,
         response_cost_failure_debug_info=None,
         status=str("success"),
-        total_tokens=int(30),
-        prompt_tokens=int(20),
-        completion_tokens=int(10),
+        total_tokens=int(
+            DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+            + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
+        ),
+        prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
+        completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
         startTime=start_time,
         endTime=end_time,
         completionStartTime=completion_start_time,

View file

@@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
 from typing import Any, Dict, List, Optional
 import litellm
+from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
 from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
 from litellm.types.utils import (
     ModelInfo,
@@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
         """
         if file_search is None:
             return 0.0
-        return 2.5 / 1000
+        return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
     @staticmethod
     def chat_completion_response_includes_annotations(

View file

@@ -11,6 +11,10 @@ from litellm.constants import (
     DEFAULT_IMAGE_HEIGHT,
     DEFAULT_IMAGE_TOKEN_COUNT,
     DEFAULT_IMAGE_WIDTH,
+    MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_TILE_HEIGHT,
+    MAX_TILE_WIDTH,
 )
 from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@@ -97,11 +101,14 @@ def resize_image_high_res(
     height: int,
 ) -> Tuple[int, int]:
     # Maximum dimensions for high res mode
-    max_short_side = 768
-    max_long_side = 2000
+    max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES
     # Return early if no resizing is needed
-    if width <= 768 and height <= 768:
+    if (
+        width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+        and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    ):
         return width, height
     # Determine the longer and shorter sides
@@ -132,7 +139,10 @@ def resize_image_high_res(
 # Test the function with the given example
 def calculate_tiles_needed(
-    resized_width, resized_height, tile_width=512, tile_height=512
+    resized_width,
+    resized_height,
+    tile_width=MAX_TILE_WIDTH,
+    tile_height=MAX_TILE_HEIGHT,
 ):
     tiles_across = (resized_width + tile_width - 1) // tile_width
     tiles_down = (resized_height + tile_height - 1) // tile_height

View file

@@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
 import httpx
 import litellm
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import (
+    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    RESPONSE_FORMAT_TOOL_NAME,
+)
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
 from litellm.llms.base_llm.base_utils import type_to_response_format_param
@@ -53,7 +56,7 @@ class AnthropicConfig(BaseConfig):
     max_tokens: Optional[
         int
-    ] = 4096  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
+    ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
     stop_sequences: Optional[list] = None
     temperature: Optional[int] = None
     top_p: Optional[int] = None
@@ -65,7 +68,7 @@ class AnthropicConfig(BaseConfig):
         self,
         max_tokens: Optional[
             int
-        ] = 4096,  # You can pass in a value yourself or use the default value 4096
+        ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,  # You can pass in a value yourself or use the default value 4096
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,

View file

@@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
 import httpx
 import litellm
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.litellm_core_utils.prompt_templates.factory import (
     custom_prompt,
     prompt_factory,
@@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):
     def __init__(
         self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens_to_sample: Optional[
+            int
+        ] = DEFAULT_MAX_TOKENS,  # anthropic requires a default
         stop_sequences: Optional[list] = None,
         temperature: Optional[int] = None,
         top_p: Optional[int] = None,

View file

@@ -7,7 +7,7 @@ import httpx  # type: ignore
 from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
 import litellm
-from litellm.constants import DEFAULT_MAX_RETRIES
+from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
 from litellm.llms.custom_httpx.http_handler import (
@@ -857,7 +857,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
         await response.aread()
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(
@@ -955,7 +955,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
         response.read()
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
         start_time = time.time()
         if "status" not in response.json():
             raise Exception(

View file

@@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
     convert_to_azure_openai_messages,
 )
 from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.azure import (
+    API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
+    API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
+)
 from litellm.types.utils import ModelResponse
 from litellm.utils import supports_response_schema
@@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
         - check if api_version is supported for response_format
         """
-        is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
+        is_supported = (
+            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
+            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
+        )
         return is_supported

View file

@@ -9,7 +9,7 @@ from pydantic import BaseModel
 from litellm._logging import verbose_logger
 from litellm.caching.caching import DualCache
-from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
+from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
 from litellm.litellm_core_utils.dd_tracing import tracer
 from litellm.secret_managers.main import get_secret
@@ -381,7 +381,7 @@ class BaseAWSLLM:
             "region_name": aws_region_name,
         }
-        if sts_response["PackedPolicySize"] > 75:
+        if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
             verbose_logger.warning(
                 f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
             )

View file

@@ -1,6 +1,7 @@
 from typing import Optional, Tuple, Union
 import litellm
+from litellm.constants import MIN_NON_ZERO_TEMPERATURE
 from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
 from litellm.secret_managers.main import get_secret_str
@@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig):
                 and value == 0
                 and model == "mistralai/Mistral-7B-Instruct-v0.1"
             ):  # this model does no support temperature == 0
-                value = 0.0001  # close to 0
+                value = MIN_NON_ZERO_TEMPERATURE  # close to 0
             if param == "tool_choice":
                 if (
                     value != "auto" and value != "none"

View file

@@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models.
 from typing import Tuple
+from litellm.constants import (
+    FIREWORKS_AI_16_B,
+    FIREWORKS_AI_56_B_MOE,
+    FIREWORKS_AI_80_B,
+    FIREWORKS_AI_176_B_MOE,
+)
 from litellm.types.utils import Usage
 from litellm.utils import get_model_info
@@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
     moe_match = re.search(r"(\d+)x(\d+)b", model_name)
     if moe_match:
         total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
-        if total_billion <= 56:
+        if total_billion <= FIREWORKS_AI_56_B_MOE:
             return "fireworks-ai-moe-up-to-56b"
-        elif total_billion <= 176:
+        elif total_billion <= FIREWORKS_AI_176_B_MOE:
             return "fireworks-ai-56b-to-176b"
     # Check for standard models in the form <number>b
@@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
         params_billion = float(params_match)
         # Determine the category based on the number of parameters
-        if params_billion <= 16.0:
+        if params_billion <= FIREWORKS_AI_16_B:
             return "fireworks-ai-up-to-16b"
-        elif params_billion <= 80.0:
+        elif params_billion <= FIREWORKS_AI_80_B:
             return "fireworks-ai-16b-80b"
     # If no matches, return the original model_name

View file

@@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
 from httpx import Headers, Response
+from litellm.constants import DEFAULT_MAX_TOKENS
 from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
 from litellm.types.llms.openai import AllMessageValues
 from litellm.types.utils import ModelResponse
@@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig):
     decoder_input_details: Optional[bool] = None
     details: bool = True  # enables returning logprobs + best of
     max_new_tokens: int = (
-        256  # openai default - requests hang if max_new_tokens not given
+        DEFAULT_MAX_TOKENS  # openai default - requests hang if max_new_tokens not given
     )
     repetition_penalty: Optional[float] = None
     return_full_text: Optional[

View file

@@ -4,6 +4,7 @@ import time
 from typing import Callable, List, Union
 import litellm
+from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS
 from litellm.llms.custom_httpx.http_handler import (
     AsyncHTTPHandler,
     HTTPHandler,
@@ -28,7 +29,9 @@ def handle_prediction_response_streaming(
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.5)  # prevent being rate limited by replicate
+        time.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
         print_verbose(f"replicate: polling endpoint: {prediction_url}")
         response = http_client.get(prediction_url, headers=headers)
         if response.status_code == 200:
@@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming(
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        await asyncio.sleep(0.5)  # prevent being rate limited by replicate
+        await asyncio.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
         print_verbose(f"replicate: polling endpoint: {prediction_url}")
         response = await http_client.get(prediction_url, headers=headers)
         if response.status_code == 200:

View file

@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
 import httpx
 import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
 from litellm.litellm_core_utils.prompt_templates.common_utils import (
     convert_content_list_to_str,
 )
@@ -221,10 +222,11 @@ class ReplicateConfig(BaseConfig):
         version_id = self.model_to_version_id(model)
         request_data: dict = {"input": input_data}
-        if ":" in version_id and len(version_id) > 64:
+        if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH:
             model_parts = version_id.split(":")
             if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
             ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                 request_data["version"] = model_parts[1]

View file

@@ -4,6 +4,16 @@ Handles calculating cost for together ai models
 import re
+from litellm.constants import (
+    TOGETHER_AI_4_B,
+    TOGETHER_AI_8_B,
+    TOGETHER_AI_21_B,
+    TOGETHER_AI_41_B,
+    TOGETHER_AI_80_B,
+    TOGETHER_AI_110_B,
+    TOGETHER_AI_EMBEDDING_150_M,
+    TOGETHER_AI_EMBEDDING_350_M,
+)
 from litellm.types.utils import CallTypes
@@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
     else:
         return model_name
     # Determine the category based on the number of parameters
-    if params_billion <= 4.0:
+    if params_billion <= TOGETHER_AI_4_B:
         category = "together-ai-up-to-4b"
-    elif params_billion <= 8.0:
+    elif params_billion <= TOGETHER_AI_8_B:
         category = "together-ai-4.1b-8b"
-    elif params_billion <= 21.0:
+    elif params_billion <= TOGETHER_AI_21_B:
         category = "together-ai-8.1b-21b"
-    elif params_billion <= 41.0:
+    elif params_billion <= TOGETHER_AI_41_B:
         category = "together-ai-21.1b-41b"
-    elif params_billion <= 80.0:
+    elif params_billion <= TOGETHER_AI_80_B:
         category = "together-ai-41.1b-80b"
-    elif params_billion <= 110.0:
+    elif params_billion <= TOGETHER_AI_110_B:
         category = "together-ai-81.1b-110b"
     if category is not None:
         return category
@@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str:
     else:
         return model_name
     # Determine the category based on the number of parameters
-    if params_million <= 150:
+    if params_million <= TOGETHER_AI_EMBEDDING_150_M:
         category = "together-ai-embedding-up-to-150m"
-    elif params_million <= 350:
+    elif params_million <= TOGETHER_AI_EMBEDDING_350_M:
         category = "together-ai-embedding-151m-to-350m"
     if category is not None:
         return category

View file

@@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional,
 from httpx import Headers, Response
+from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON
 from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
 from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
 from litellm.llms.base_llm.chat.transformation import (
@@ -196,7 +197,9 @@ class TritonGenerateConfig(TritonConfig):
         data_for_triton: Dict[str, Any] = {
             "text_input": prompt_factory(model=model, messages=messages),
             "parameters": {
-                "max_tokens": int(optional_params.get("max_tokens", 2000)),
+                "max_tokens": int(
+                    optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
+                ),
                 "bad_words": [""],
                 "stop_words": [""],
             },

View file

@@ -51,6 +51,10 @@ from litellm import (  # type: ignore
     get_litellm_params,
     get_optional_params,
 )
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
 from litellm.exceptions import LiteLLMUnknownProvider
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.audio_utils.utils import get_audio_file_for_health_check
@@ -740,7 +744,12 @@ def mock_completion(
         setattr(
             model_response,
             "usage",
-            Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30),
+            Usage(
+                prompt_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+                completion_tokens=DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+                total_tokens=DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+                + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+            ),
         )
     try:
@@ -3067,7 +3076,7 @@ def completion(  # type: ignore  # noqa: PLR0915
                     "max_tokens": max_tokens,
                     "temperature": temperature,
                     "top_p": top_p,
-                    "top_k": kwargs.get("top_k", 40),
+                    "top_k": kwargs.get("top_k"),
                },
            },
        )

View file

@@ -20,6 +20,7 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.caching.caching import DualCache
 from litellm.caching.dual_cache import LimitedSizeOrderedDict
+from litellm.constants import DEFAULT_IN_MEMORY_TTL
 from litellm.litellm_core_utils.get_llm_provider_logic import get_llm_provider
 from litellm.proxy._types import (
     RBAC_ROLES,
@@ -55,7 +56,7 @@ else:
 last_db_access_time = LimitedSizeOrderedDict(max_size=100)
-db_cache_expiry = 5  # refresh every 5s
+db_cache_expiry = DEFAULT_IN_MEMORY_TTL  # refresh every 5s
 all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value

View file

@@ -9,6 +9,7 @@ from typing import Optional
 import httpx
 from litellm._logging import verbose_proxy_logger
+from litellm.constants import NON_LLM_CONNECTION_TIMEOUT
 from litellm.llms.custom_httpx.http_handler import HTTPHandler
@@ -23,7 +24,7 @@ class LicenseCheck:
     def __init__(self) -> None:
         self.license_str = os.getenv("LITELLM_LICENSE", None)
         verbose_proxy_logger.debug("License Str value - {}".format(self.license_str))
-        self.http_handler = HTTPHandler(timeout=15)
+        self.http_handler = HTTPHandler(timeout=NON_LLM_CONNECTION_TIMEOUT)
         self.public_key = None
         self.read_public_key()

View file

@@ -15,6 +15,7 @@ from fastapi import HTTPException
 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.caching.caching import DualCache
+from litellm.constants import DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.prompt_templates.factory import (
     prompt_injection_detection_default_pt,
@@ -110,7 +111,9 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger):
         return combinations
     def check_user_input_similarity(
-        self, user_input: str, similarity_threshold: float = 0.7
+        self,
+        user_input: str,
+        similarity_threshold: float = DEFAULT_PROMPT_INJECTION_SIMILARITY_THRESHOLD,
     ) -> bool:
         user_input_lower = user_input.lower()
         keywords = self.generate_injection_keywords()

View file

@@ -24,7 +24,7 @@ from fastapi import APIRouter, Depends, Header, HTTPException, Query, Request, s
 import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.caching import DualCache
-from litellm.constants import UI_SESSION_TOKEN_TEAM_ID
+from litellm.constants import LENGTH_OF_LITELLM_GENERATED_KEY, UI_SESSION_TOKEN_TEAM_ID
 from litellm.litellm_core_utils.duration_parser import duration_in_seconds
 from litellm.proxy._types import *
 from litellm.proxy.auth.auth_checks import (
@@ -1164,7 +1164,7 @@ async def generate_key_helper_fn(  # noqa: PLR0915
     if key is not None:
         token = key
     else:
-        token = f"sk-{secrets.token_urlsafe(16)}"
+        token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
     if duration is None:  # allow tokens that never expire
         expires = None
@@ -1745,7 +1745,7 @@ async def regenerate_key_fn(
         verbose_proxy_logger.debug("key_in_db: %s", _key_in_db)
-        new_token = f"sk-{secrets.token_urlsafe(16)}"
+        new_token = f"sk-{secrets.token_urlsafe(LENGTH_OF_LITELLM_GENERATED_KEY)}"
         new_token_hash = hash_token(new_token)
         new_token_key_name = f"sk-...{new_token[-4:]}"

View file

@@ -15,6 +15,10 @@ from litellm.litellm_core_utils.litellm_logging import (
 )
 from litellm.litellm_core_utils.thread_pool_executor import executor
 from litellm.proxy.pass_through_endpoints.types import PassthroughStandardLoggingPayload
+from litellm.types.passthrough_endpoints.assembly_ai import (
+    ASSEMBLY_AI_MAX_POLLING_ATTEMPTS,
+    ASSEMBLY_AI_POLLING_INTERVAL,
+)
 class AssemblyAITranscriptResponse(TypedDict, total=False):
@@ -34,13 +38,13 @@ class AssemblyAIPassthroughLoggingHandler:
         The base URL for the AssemblyAI API
         """
-        self.polling_interval: float = 10
+        self.polling_interval: float = ASSEMBLY_AI_POLLING_INTERVAL
         """
         The polling interval for the AssemblyAI API.
         litellm needs to poll the GET /transcript/{transcript_id} endpoint to get the status of the transcript.
         """
-        self.max_polling_attempts = 180
+        self.max_polling_attempts = ASSEMBLY_AI_MAX_POLLING_ATTEMPTS
         """
         The maximum number of polling attempts for the AssemblyAI API.
         """

View file

@ -25,7 +25,10 @@ from typing import (
get_type_hints, get_type_hints,
) )
from litellm.constants import DEFAULT_MAX_RECURSE_DEPTH from litellm.constants import (
DEFAULT_MAX_RECURSE_DEPTH,
DEFAULT_SLACK_ALERTING_THRESHOLD,
)
from litellm.types.utils import ( from litellm.types.utils import (
ModelResponse, ModelResponse,
ModelResponseStream, ModelResponseStream,
@ -118,7 +121,16 @@ import litellm
from litellm import Router from litellm import Router
from litellm._logging import verbose_proxy_logger, verbose_router_logger from litellm._logging import verbose_proxy_logger, verbose_router_logger
from litellm.caching.caching import DualCache, RedisCache from litellm.caching.caching import DualCache, RedisCache
from litellm.constants import LITELLM_PROXY_ADMIN_NAME from litellm.constants import (
DAYS_IN_A_MONTH,
DEFAULT_HEALTH_CHECK_INTERVAL,
DEFAULT_MODEL_CREATED_AT_TIME,
LITELLM_PROXY_ADMIN_NAME,
PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
PROXY_BATCH_WRITE_AT,
PROXY_BUDGET_RESCHEDULER_MAX_TIME,
PROXY_BUDGET_RESCHEDULER_MIN_TIME,
)
from litellm.exceptions import RejectedRequestError from litellm.exceptions import RejectedRequestError
from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting from litellm.integrations.SlackAlerting.slack_alerting import SlackAlerting
from litellm.litellm_core_utils.core_helpers import ( from litellm.litellm_core_utils.core_helpers import (
@ -287,7 +299,7 @@ from litellm.router import (
LiteLLM_Params, LiteLLM_Params,
ModelGroupInfo, ModelGroupInfo,
) )
from litellm.scheduler import DefaultPriorities, FlowItem, Scheduler from litellm.scheduler import FlowItem, Scheduler
from litellm.secret_managers.aws_secret_manager import load_aws_kms from litellm.secret_managers.aws_secret_manager import load_aws_kms
from litellm.secret_managers.google_kms import load_google_kms from litellm.secret_managers.google_kms import load_google_kms
from litellm.secret_managers.main import ( from litellm.secret_managers.main import (
@ -307,6 +319,7 @@ from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import DeploymentTypedDict from litellm.types.router import DeploymentTypedDict
from litellm.types.router import ModelInfo as RouterModelInfo from litellm.types.router import ModelInfo as RouterModelInfo
from litellm.types.router import RouterGeneralSettings, updateDeployment from litellm.types.router import RouterGeneralSettings, updateDeployment
from litellm.types.scheduler import DefaultPriorities
from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer from litellm.types.utils import CredentialItem, CustomHuggingfaceTokenizer
from litellm.types.utils import ModelInfo as ModelMapInfo from litellm.types.utils import ModelInfo as ModelMapInfo
from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload from litellm.types.utils import RawRequestTypedDict, StandardLoggingPayload
@ -779,9 +792,9 @@ queue: List = []
litellm_proxy_budget_name = "litellm-proxy-budget" litellm_proxy_budget_name = "litellm-proxy-budget"
litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME litellm_proxy_admin_name = LITELLM_PROXY_ADMIN_NAME
ui_access_mode: Literal["admin", "all"] = "all" ui_access_mode: Literal["admin", "all"] = "all"
proxy_budget_rescheduler_min_time = 597 proxy_budget_rescheduler_min_time = PROXY_BUDGET_RESCHEDULER_MIN_TIME
proxy_budget_rescheduler_max_time = 605 proxy_budget_rescheduler_max_time = PROXY_BUDGET_RESCHEDULER_MAX_TIME
proxy_batch_write_at = 10 # in seconds proxy_batch_write_at = PROXY_BATCH_WRITE_AT
litellm_master_key_hash = None litellm_master_key_hash = None
disable_spend_logs = False disable_spend_logs = False
jwt_handler = JWTHandler() jwt_handler = JWTHandler()
@ -1846,7 +1859,9 @@ class ProxyConfig:
use_background_health_checks = general_settings.get( use_background_health_checks = general_settings.get(
"background_health_checks", False "background_health_checks", False
) )
health_check_interval = general_settings.get("health_check_interval", 300) health_check_interval = general_settings.get(
"health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL
)
health_check_details = general_settings.get("health_check_details", True) health_check_details = general_settings.get("health_check_details", True)
### RBAC ### ### RBAC ###
@ -3145,7 +3160,7 @@ class ProxyStartupEvent:
scheduler.add_job( scheduler.add_job(
proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus, proxy_logging_obj.slack_alerting_instance.send_fallback_stats_from_prometheus,
"cron", "cron",
hour=9, hour=PROMETHEUS_FALLBACK_STATS_SEND_TIME_HOURS,
minute=0, minute=0,
timezone=ZoneInfo("America/Los_Angeles"), # Pacific Time timezone=ZoneInfo("America/Los_Angeles"), # Pacific Time
) )
@ -3278,7 +3293,7 @@ async def model_list(
{ {
"id": model, "id": model,
"object": "model", "object": "model",
"created": 1677610602, "created": DEFAULT_MODEL_CREATED_AT_TIME,
"owned_by": "openai", "owned_by": "openai",
} }
for model in all_models for model in all_models
@ -5592,7 +5607,7 @@ async def model_metrics(
param="None", param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR, code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )
startTime = startTime or datetime.now() - timedelta(days=30) startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now() endTime = endTime or datetime.now()
if api_key is None or api_key == "undefined": if api_key is None or api_key == "undefined":
@ -5713,11 +5728,12 @@ async def model_metrics_slow_responses(
if customer is None or customer == "undefined": if customer is None or customer == "undefined":
customer = "null" customer = "null"
startTime = startTime or datetime.now() - timedelta(days=30) startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now() endTime = endTime or datetime.now()
alerting_threshold = ( alerting_threshold = (
proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300 proxy_logging_obj.slack_alerting_instance.alerting_threshold
or DEFAULT_SLACK_ALERTING_THRESHOLD
) )
alerting_threshold = int(alerting_threshold) alerting_threshold = int(alerting_threshold)
@ -5797,7 +5813,7 @@ async def model_metrics_exceptions(
code=status.HTTP_500_INTERNAL_SERVER_ERROR, code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )
startTime = startTime or datetime.now() - timedelta(days=30) startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)
endTime = endTime or datetime.now() endTime = endTime or datetime.now()
if api_key is None or api_key == "undefined": if api_key is None or api_key == "undefined":
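
For readers skimming the proxy_server.py hunks above, a minimal standalone sketch of how these defaults resolve. DAYS_IN_A_MONTH and DEFAULT_HEALTH_CHECK_INTERVAL are assumed here to equal the 30 and 300 literals they replace; everything else is illustrative, not the proxy's actual code path.

from datetime import datetime, timedelta

# Assumed values, mirroring the literals replaced in the hunks above.
DAYS_IN_A_MONTH = 30
DEFAULT_HEALTH_CHECK_INTERVAL = 300  # seconds

general_settings: dict = {"background_health_checks": True}  # no interval configured

# dict.get() falls back to the named constant when the key is absent.
health_check_interval = general_settings.get(
    "health_check_interval", DEFAULT_HEALTH_CHECK_INTERVAL
)

# `x or default` falls back when the caller passed None (or any other falsy value).
startTime = None
startTime = startTime or datetime.now() - timedelta(days=DAYS_IN_A_MONTH)

print(health_check_interval)  # 300
print(startTime)              # roughly one month in the past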

View file

@ -22,6 +22,7 @@ from typing import (
overload, overload,
) )
from litellm.constants import MAX_TEAM_LIST_LIMIT
from litellm.proxy._types import ( from litellm.proxy._types import (
DB_CONNECTION_ERROR_TYPES, DB_CONNECTION_ERROR_TYPES,
CommonProxyErrors, CommonProxyErrors,
@ -1596,7 +1597,9 @@ class PrismaClient:
where={"team_id": {"in": team_id_list}} where={"team_id": {"in": team_id_list}}
) )
elif query_type == "find_all" and team_id_list is None: elif query_type == "find_all" and team_id_list is None:
response = await self.db.litellm_teamtable.find_many(take=20) response = await self.db.litellm_teamtable.find_many(
take=MAX_TEAM_LIST_LIMIT
)
return response return response
elif table_name == "user_notification": elif table_name == "user_notification":
if query_type == "find_unique": if query_type == "find_unique":

View file

@ -50,6 +50,7 @@ from litellm.caching.caching import (
RedisCache, RedisCache,
RedisClusterCache, RedisClusterCache,
) )
from litellm.constants import DEFAULT_MAX_LRU_CACHE_SIZE
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.asyncify import run_async_function from litellm.litellm_core_utils.asyncify import run_async_function
from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
@ -5073,7 +5074,7 @@ class Router:
rpm_usage += t rpm_usage += t
return tpm_usage, rpm_usage return tpm_usage, rpm_usage
@lru_cache(maxsize=64) @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _cached_get_model_group_info( def _cached_get_model_group_info(
self, model_group: str self, model_group: str
) -> Optional[ModelGroupInfo]: ) -> Optional[ModelGroupInfo]:
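
A short sketch of the lru_cache pattern used above, with the maxsize driven by a named constant. The constant's value and the function name here are assumptions for illustration only; the real constant lives in litellm/constants.py.

from functools import lru_cache

# Assumed value for the sketch.
DEFAULT_MAX_LRU_CACHE_SIZE = 128

calls = 0

@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def cached_model_group_lookup(model_group: str) -> str:
    # Stand-in for an expensive lookup like _cached_get_model_group_info.
    global calls
    calls += 1
    return model_group.upper()

cached_model_group_lookup("gpt-4o")
cached_model_group_lookup("gpt-4o")  # second call is served from the LRU cache
print(calls)  # 1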

View file

@ -1,6 +1,7 @@
from typing import TYPE_CHECKING, Any, Optional, Union from typing import TYPE_CHECKING, Any, Optional, Union
from litellm._logging import verbose_router_logger from litellm._logging import verbose_router_logger
from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH
from litellm.router_utils.cooldown_handlers import ( from litellm.router_utils.cooldown_handlers import (
_async_get_cooldown_deployments_with_debug_info, _async_get_cooldown_deployments_with_debug_info,
) )
@ -54,7 +55,7 @@ async def send_llm_exception_alert(
exception_str = str(original_exception) exception_str = str(original_exception)
if litellm_debug_info is not None: if litellm_debug_info is not None:
exception_str += litellm_debug_info exception_str += litellm_debug_info
exception_str += f"\n\n{error_traceback_str[:2000]}" exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}"
await litellm_router_instance.slack_alerting_logger.send_alert( await litellm_router_instance.slack_alerting_logger.send_alert(
message=f"LLM API call failed: `{exception_str}`", message=f"LLM API call failed: `{exception_str}`",

View file

@ -6,17 +6,14 @@ from pydantic import BaseModel
from litellm import print_verbose from litellm import print_verbose
from litellm.caching.caching import DualCache, RedisCache from litellm.caching.caching import DualCache, RedisCache
from litellm.constants import DEFAULT_IN_MEMORY_TTL, DEFAULT_POLLING_INTERVAL
class SchedulerCacheKeys(enum.Enum): class SchedulerCacheKeys(enum.Enum):
queue = "scheduler:queue" queue = "scheduler:queue"
default_in_memory_ttl = 5 # cache queue in-memory for 5s when redis cache available default_in_memory_ttl = (
DEFAULT_IN_MEMORY_TTL # cache queue in-memory for 5s when redis cache available
)
class DefaultPriorities(enum.Enum):
High = 0
Medium = 128
Low = 255
class FlowItem(BaseModel): class FlowItem(BaseModel):
@ -44,7 +41,9 @@ class Scheduler:
self.cache = DualCache( self.cache = DualCache(
redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl redis_cache=redis_cache, default_in_memory_ttl=default_in_memory_ttl
) )
self.polling_interval = polling_interval or 0.03 # default to 3ms self.polling_interval = (
polling_interval or DEFAULT_POLLING_INTERVAL
) # default to 3ms
async def add_request(self, request: FlowItem): async def add_request(self, request: FlowItem):
# We use the priority directly, as lower values indicate higher priority # We use the priority directly, as lower values indicate higher priority

View file

@ -5,6 +5,7 @@ from typing import Optional
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.caching.caching import InMemoryCache from litellm.caching.caching import InMemoryCache
from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase from litellm.integrations.gcs_bucket.gcs_bucket_base import GCSBucketBase
from litellm.llms.custom_httpx.http_handler import _get_httpx_client from litellm.llms.custom_httpx.http_handler import _get_httpx_client
from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem
@ -13,7 +14,7 @@ from litellm.proxy._types import CommonProxyErrors, KeyManagementSystem
class GoogleSecretManager(GCSBucketBase): class GoogleSecretManager(GCSBucketBase):
def __init__( def __init__(
self, self,
refresh_interval: Optional[int] = 86400, refresh_interval: Optional[int] = SECRET_MANAGER_REFRESH_INTERVAL,
always_read_secret_manager: Optional[bool] = False, always_read_secret_manager: Optional[bool] = False,
) -> None: ) -> None:
""" """

View file

@ -6,6 +6,7 @@ import httpx
import litellm import litellm
from litellm._logging import verbose_logger from litellm._logging import verbose_logger
from litellm.caching import InMemoryCache from litellm.caching import InMemoryCache
from litellm.constants import SECRET_MANAGER_REFRESH_INTERVAL
from litellm.llms.custom_httpx.http_handler import ( from litellm.llms.custom_httpx.http_handler import (
_get_httpx_client, _get_httpx_client,
get_async_httpx_client, get_async_httpx_client,
@ -39,8 +40,14 @@ class HashicorpSecretManager(BaseSecretManager):
litellm.secret_manager_client = self litellm.secret_manager_client = self
litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT litellm._key_management_system = KeyManagementSystem.HASHICORP_VAULT
_refresh_interval = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", 86400) _refresh_interval = os.environ.get(
_refresh_interval = int(_refresh_interval) if _refresh_interval else 86400 "HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL
)
_refresh_interval = (
int(_refresh_interval)
if _refresh_interval
else SECRET_MANAGER_REFRESH_INTERVAL
)
self.cache = InMemoryCache( self.cache = InMemoryCache(
default_ttl=_refresh_interval default_ttl=_refresh_interval
) # store in memory for 1 day ) # store in memory for 1 day
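
The refresh-interval hunk above follows a common env-var-with-default pattern; here is a minimal sketch of it in isolation, assuming SECRET_MANAGER_REFRESH_INTERVAL equals the 86400 (one day, in seconds) it replaces.

import os

# Assumed value for the sketch.
SECRET_MANAGER_REFRESH_INTERVAL = 86400

# os.environ.get() returns a string when the variable is set, so an int() conversion
# is still needed; the constant doubles as the fallback when the variable is unset
# or set to an empty string.
_raw = os.environ.get("HCP_VAULT_REFRESH_INTERVAL", SECRET_MANAGER_REFRESH_INTERVAL)
refresh_interval = int(_raw) if _raw else SECRET_MANAGER_REFRESH_INTERVAL
print(refresh_interval)  # 86400 unless HCP_VAULT_REFRESH_INTERVAL overrides it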

View file

@ -1,6 +1,8 @@
from enum import Enum from enum import Enum
from typing import Optional, TypedDict from typing import Optional, TypedDict
DD_MAX_BATCH_SIZE = 1000
class DataDogStatus(str, Enum): class DataDogStatus(str, Enum):
INFO = "info" INFO = "info"

View file

@ -8,6 +8,10 @@ else:
VertexBase = Any VertexBase = Any
GCS_DEFAULT_BATCH_SIZE = 2048
GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSLoggingConfig(TypedDict): class GCSLoggingConfig(TypedDict):
""" """
Internal LiteLLM Config for GCS Bucket logging Internal LiteLLM Config for GCS Bucket logging

View file

@ -7,6 +7,9 @@ from pydantic import BaseModel, Field
from litellm.types.utils import LiteLLMPydanticObjectBase from litellm.types.utils import LiteLLMPydanticObjectBase
SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15
class BaseOutageModel(TypedDict): class BaseOutageModel(TypedDict):
alerts: List[int] alerts: List[int]

View file

@ -0,0 +1,2 @@
API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

View file

@ -0,0 +1 @@

View file

@ -0,0 +1,2 @@
ASSEMBLY_AI_POLLING_INTERVAL = 10
ASSEMBLY_AI_MAX_POLLING_ATTEMPTS = 180
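
If the polling interval is in seconds, these two defaults bound transcript polling at roughly 10 s × 180 attempts = 1,800 s (30 minutes) per request.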

View file

@ -0,0 +1,7 @@
from enum import Enum
class DefaultPriorities(Enum):
High = 0
Medium = 128
Low = 255
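
A small sketch of how these priority values order requests. The enum is re-declared so the snippet runs standalone; the sort itself is illustrative, not the scheduler's actual queueing code.

from enum import Enum

# Mirror of the enum added above.
class DefaultPriorities(Enum):
    High = 0
    Medium = 128
    Low = 255

# Per the scheduler comment above, lower values indicate higher priority,
# so sorting by the enum's value serves High before Medium before Low.
pending = [DefaultPriorities.Low, DefaultPriorities.High, DefaultPriorities.Medium]
pending.sort(key=lambda p: p.value)
print([p.name for p in pending])  # ['High', 'Medium', 'Low']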

View file

@ -62,6 +62,16 @@ import litellm.llms.gemini
from litellm.caching._internal_lru_cache import lru_cache_wrapper from litellm.caching._internal_lru_cache import lru_cache_wrapper
from litellm.caching.caching import DualCache from litellm.caching.caching import DualCache
from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
from litellm.constants import (
DEFAULT_MAX_LRU_CACHE_SIZE,
DEFAULT_TRIM_RATIO,
FUNCTION_DEFINITION_TOKEN_COUNT,
INITIAL_RETRY_DELAY,
JITTER,
MAX_RETRY_DELAY,
MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
TOOL_CHOICE_OBJECT_TOKEN_COUNT,
)
from litellm.integrations.custom_guardrail import CustomGuardrail from litellm.integrations.custom_guardrail import CustomGuardrail
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.core_helpers import ( from litellm.litellm_core_utils.core_helpers import (
@ -1520,7 +1530,7 @@ def _select_tokenizer(
return _select_tokenizer_helper(model=model) return _select_tokenizer_helper(model=model)
@lru_cache(maxsize=128) @lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse: def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
if litellm.disable_hf_tokenizer_download is True: if litellm.disable_hf_tokenizer_download is True:
return _return_openai_tokenizer(model) return _return_openai_tokenizer(model)
@ -5336,15 +5346,15 @@ def _calculate_retry_after(
if retry_after is not None and 0 < retry_after <= 60: if retry_after is not None and 0 < retry_after <= 60:
return retry_after return retry_after
initial_retry_delay = 0.5 initial_retry_delay = INITIAL_RETRY_DELAY
max_retry_delay = 8.0 max_retry_delay = MAX_RETRY_DELAY
nb_retries = max_retries - remaining_retries nb_retries = max_retries - remaining_retries
# Apply exponential backoff, but not more than the max. # Apply exponential backoff, but not more than the max.
sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay) sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)
# Apply some jitter, plus-or-minus half a second. # Apply some jitter, plus-or-minus half a second.
jitter = 1 - 0.25 * random.random() jitter = JITTER * random.random()
timeout = sleep_seconds * jitter timeout = sleep_seconds * jitter
return timeout if timeout >= min_timeout else min_timeout return timeout if timeout >= min_timeout else min_timeout
@ -5670,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
def trim_messages( def trim_messages(
messages, messages,
model: Optional[str] = None, model: Optional[str] = None,
trim_ratio: float = 0.75, trim_ratio: float = DEFAULT_TRIM_RATIO,
return_response_tokens: bool = False, return_response_tokens: bool = False,
max_tokens=None, max_tokens=None,
): ):
@ -6543,7 +6553,7 @@ def is_prompt_caching_valid_prompt(
model=model, model=model,
use_default_image_token_count=True, use_default_image_token_count=True,
) )
return token_count >= 1024 return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
except Exception as e: except Exception as e:
verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}") verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
return False return False
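
The retry-backoff hunk above is easier to follow with concrete numbers. A minimal sketch mirroring that calculation, assuming INITIAL_RETRY_DELAY = 0.5, MAX_RETRY_DELAY = 8.0, and JITTER = 0.75 (the constants' actual values live in litellm/constants.py and are not shown in this diff).

import random

# Assumed constant values for the sketch.
INITIAL_RETRY_DELAY = 0.5
MAX_RETRY_DELAY = 8.0
JITTER = 0.75

def calculate_retry_after(remaining_retries: int, max_retries: int, min_timeout: float = 0.0) -> float:
    nb_retries = max_retries - remaining_retries
    # Exponential backoff, capped at the max delay: 0.5s, 1s, 2s, 4s, 8s, 8s, ...
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    # Scale by a random jitter factor so concurrent clients don't retry in lockstep.
    timeout = sleep_seconds * (JITTER * random.random())
    return timeout if timeout >= min_timeout else min_timeout

for remaining in (2, 1, 0):
    print(round(calculate_retry_after(remaining_retries=remaining, max_retries=3), 3))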

View file

@ -3,6 +3,7 @@ warn_return_any = False
ignore_missing_imports = True ignore_missing_imports = True
mypy_path = litellm/stubs mypy_path = litellm/stubs
namespace_packages = True namespace_packages = True
disable_error_code = valid-type
[mypy-google.*] [mypy-google.*]
ignore_missing_imports = True ignore_missing_imports = True

View file

@ -0,0 +1,152 @@
import sys
import ast
import os
# Extremely restrictive set of allowed numbers
ALLOWED_NUMBERS = {
0,
1,
-1,
2,
10,
100,
1000,
4,
3,
500,
6,
60,
3600,
0.75,
7,
1024,
1011,
600,
12,
1000000000.0,
0.1,
50,
128,
6000,
30,
1000000,
5,
15,
25,
10000,
60000,
8,
2048,
16000000000,
16,
16383,
14,
24,
128000,
0.01,
20,
}
# Add all standard HTTP status codes
HTTP_STATUS_CODES = {
200, # OK
201, # Created
202, # Accepted
204, # No Content
300, # Multiple Choices
301, # Moved Permanently
302, # Found
303, # See Other
304, # Not Modified
307, # Temporary Redirect
308, # Permanent Redirect
400, # Bad Request
401, # Unauthorized
402, # Payment Required
403, # Forbidden
404, # Not Found
406, # Not Acceptable
408, # Request Timeout
409, # Conflict
413, # Payload Too Large
422, # Unprocessable Entity
424, # Failed Dependency
429, # Too Many Requests
498, # Invalid Token
499, # Client Closed Request
500, # Internal Server Error
501, # Not Implemented
502, # Bad Gateway
503, # Service Unavailable
504, # Gateway Timeout
520, # Web server is returning an unknown error
522, # Connection timed out
524, # A timeout occurred
529, # Site is overloaded
}
# Combine the sets
ALLOWED_NUMBERS = ALLOWED_NUMBERS.union(HTTP_STATUS_CODES)
class HardcodedNumberFinder(ast.NodeVisitor):
def __init__(self):
self.hardcoded_numbers = []
def visit_Constant(self, node):
# For Python 3.8+
if isinstance(node.value, (int, float)) and node.value not in ALLOWED_NUMBERS:
self.hardcoded_numbers.append((node.lineno, node.value))
self.generic_visit(node)
def visit_Num(self, node):
# For older Python versions
if node.n not in ALLOWED_NUMBERS:
self.hardcoded_numbers.append((node.lineno, node.n))
self.generic_visit(node)
def check_file(filename):
try:
with open(filename, "r") as f:
content = f.read()
tree = ast.parse(content)
finder = HardcodedNumberFinder()
finder.visit(tree)
if finder.hardcoded_numbers:
print(f"ERROR in {filename}: Hardcoded numbers detected:")
for line, value in finder.hardcoded_numbers:
print(f" Line {line}: {value}")
return 1
return 0
except SyntaxError:
print(f"Syntax error in {filename}")
return 0
def main():
exit_code = 0
folder = "../../litellm"
ignore_files = [
"constants.py",
"proxy_cli.py",
"token_counter.py",
"mock_functions.py",
"duration_parser.py",
"utils.py",
]
ignore_folder = "types"
for root, dirs, files in os.walk(folder):
for filename in files:
if filename.endswith(".py") and filename not in ignore_files:
full_path = os.path.join(root, filename)
if ignore_folder in full_path:
continue
exit_code |= check_file(full_path)
sys.exit(exit_code)
if __name__ == "__main__":
main()
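
To make the new check concrete, here is a self-contained sketch of the same AST approach run on an inline snippet. The allowlist, snippet, and Finder class below are illustrative only, not the script's actual configuration.

import ast

# Illustrative allowlist; the real one is the ALLOWED_NUMBERS set above.
ALLOWED = {0, 1, -1, 2}

snippet = '''
def retry_delay(attempt):
    return min(0.5 * 2 ** attempt, 8.0)
'''

class Finder(ast.NodeVisitor):
    def __init__(self):
        self.found = []

    def visit_Constant(self, node):
        # Same idea as HardcodedNumberFinder: flag numeric literals outside the allowlist
        # (bools are ints in Python, so they are skipped explicitly here).
        if isinstance(node.value, (int, float)) and not isinstance(node.value, bool):
            if node.value not in ALLOWED:
                self.found.append((node.lineno, node.value))
        self.generic_visit(node)

finder = Finder()
finder.visit(ast.parse(snippet))
for lineno, value in finder.found:
    print(f"Line {lineno}: hardcoded {value}")
# Prints:
# Line 3: hardcoded 0.5
# Line 3: hardcoded 8.0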

View file