refactor: complete migration

commit cb6e9fbe40
parent bfc159172d
Author: Krrish Dholakia
Date:   2025-03-24 19:52:45 -07:00
32 changed files with 203 additions and 210 deletions

View file

@@ -19,6 +19,7 @@ from pydantic import BaseModel
import litellm
from litellm._logging import verbose_logger
+from litellm.constants import CACHED_STREAMING_CHUNK_DELAY
from litellm.litellm_core_utils.model_param_helper import ModelParamHelper
from litellm.types.caching import *
from litellm.types.utils import all_litellm_params
@@ -406,7 +407,7 @@ class Cache:
                    }
                ]
            }
-            time.sleep(0.02)
+            time.sleep(CACHED_STREAMING_CHUNK_DELAY)
    def _get_cache_logic(
        self,
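
(Illustration, not part of the commit.) The hunk above swaps the literal 0.02 for CACHED_STREAMING_CHUNK_DELAY inside the path that replays a cached response as a stream. A minimal sketch of that pattern, assuming a generator-style replay; everything except the constant's name and value is made up:

import time

CACHED_STREAMING_CHUNK_DELAY = 0.02  # value added to litellm/constants.py in this commit

def replay_cached_stream(cached_chunks):
    """Yield previously cached chunks with a small delay between them,
    so a cache hit still behaves like a live streaming response."""
    for chunk in cached_chunks:  # cached_chunks is a hypothetical list of chunk dicts
        yield chunk
        time.sleep(CACHED_STREAMING_CHUNK_DELAY)

# usage: for chunk in replay_cached_stream(cached_chunks): process(chunk)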

View file

@@ -15,7 +15,8 @@ from typing import Any, List, Optional
from pydantic import BaseModel
-from ..constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
+from litellm.constants import MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
from .base_cache import BaseCache
@@ -52,7 +53,8 @@ class InMemoryCache(BaseCache):
        # Fast path for common primitive types that are typically small
        if (
            isinstance(value, (bool, int, float, str))
-            and len(str(value)) < self.max_size_per_item * 512
+            and len(str(value))
+            < self.max_size_per_item * MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB
        ):  # Conservative estimate
            return True

View file

@@ -11,10 +11,12 @@ Has 4 methods:
import ast
import asyncio
import json
-from typing import Any
+from typing import Any, cast
import litellm
from litellm._logging import print_verbose
+from litellm.constants import QDRANT_SCALAR_QUANTILE, QDRANT_VECTOR_SIZE
+from litellm.types.utils import EmbeddingResponse
from .base_cache import BaseCache
@@ -118,7 +120,11 @@ class QdrantSemanticCache(BaseCache):
            }
        elif quantization_config == "scalar":
            quantization_params = {
-                "scalar": {"type": "int8", "quantile": 0.99, "always_ram": False}
+                "scalar": {
+                    "type": "int8",
+                    "quantile": QDRANT_SCALAR_QUANTILE,
+                    "always_ram": False,
+                }
            }
        elif quantization_config == "product":
            quantization_params = {
@@ -132,7 +138,7 @@ class QdrantSemanticCache(BaseCache):
        new_collection_status = self.sync_client.put(
            url=f"{self.qdrant_api_base}/collections/{self.collection_name}",
            json={
-                "vectors": {"size": 1536, "distance": "Cosine"},
+                "vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"},
                "quantization_config": quantization_params,
            },
            headers=self.headers,
@@ -171,10 +177,13 @@ class QdrantSemanticCache(BaseCache):
                prompt += message["content"]
        # create an embedding for prompt
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
        )
        # get the embedding
@@ -212,10 +221,13 @@ class QdrantSemanticCache(BaseCache):
                prompt += message["content"]
        # convert to embedding
-        embedding_response = litellm.embedding(
-            model=self.embedding_model,
-            input=prompt,
-            cache={"no-store": True, "no-cache": True},
+        embedding_response = cast(
+            EmbeddingResponse,
+            litellm.embedding(
+                model=self.embedding_model,
+                input=prompt,
+                cache={"no-store": True, "no-cache": True},
+            ),
        )
        # get the embedding
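
(Illustration, not part of the commit.) With the two new constants, the collection-creation payload the Qdrant semantic cache PUTs looks roughly like the sketch below; the field names and values come from the hunk above, while the helper function itself is hypothetical:

QDRANT_SCALAR_QUANTILE = 0.99
QDRANT_VECTOR_SIZE = 1536

def build_qdrant_collection_body(quantization_config: str = "scalar") -> dict:
    # request body for PUT /collections/{name}, mirroring the hunk above
    body = {"vectors": {"size": QDRANT_VECTOR_SIZE, "distance": "Cosine"}}
    if quantization_config == "scalar":
        body["quantization_config"] = {
            "scalar": {
                "type": "int8",
                "quantile": QDRANT_SCALAR_QUANTILE,
                "always_ram": False,
            }
        }
    return body

print(build_qdrant_collection_body())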

View file

@@ -26,12 +26,19 @@ HOURS_IN_A_DAY = 24
DAYS_IN_A_WEEK = 7
DAYS_IN_A_MONTH = 28
DAYS_IN_A_YEAR = 365
+REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64
#### TOKEN COUNTING ####
FUNCTION_DEFINITION_TOKEN_COUNT = 9
SYSTEM_MESSAGE_TOKEN_COUNT = 4
TOOL_CHOICE_OBJECT_TOKEN_COUNT = 4
DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20
+MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES = 768
+MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES = 2000
+MAX_TILE_WIDTH = 512
+MAX_TILE_HEIGHT = 512
+OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000
+MIN_NON_ZERO_TEMPERATURE = 0.0001
#### RELIABILITY ####
REPEATED_STREAMING_CHUNK_LIMIT = 100  # catch if model starts looping the same chunk while streaming. Uses high default to prevent false positives.
DEFAULT_MAX_LRU_CACHE_SIZE = 16
@@ -40,15 +47,36 @@ MAX_RETRY_DELAY = 8.0
JITTER = 0.75
DEFAULT_IN_MEMORY_TTL = 5  # default time to live for the in-memory cache
DEFAULT_POLLING_INTERVAL = 0.03  # default polling interval for the scheduler
+AZURE_OPERATION_POLLING_TIMEOUT = 120
REDIS_SOCKET_TIMEOUT = 0.1
REDIS_CONNECTION_POOL_TIMEOUT = 5
NON_LLM_CONNECTION_TIMEOUT = 15  # timeout for adjacent services (e.g. jwt auth)
+MAX_EXCEPTION_MESSAGE_LENGTH = 2000
+BEDROCK_MAX_POLICY_SIZE = 75
+REPLICATE_POLLING_DELAY_SECONDS = 0.5
+DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS = 4096
+TOGETHER_AI_4_B = 4
+TOGETHER_AI_8_B = 8
+TOGETHER_AI_21_B = 21
+TOGETHER_AI_41_B = 41
+TOGETHER_AI_80_B = 80
+TOGETHER_AI_110_B = 110
+TOGETHER_AI_EMBEDDING_150_M = 150
+TOGETHER_AI_EMBEDDING_350_M = 350
+QDRANT_SCALAR_QUANTILE = 0.99
+QDRANT_VECTOR_SIZE = 1536
+CACHED_STREAMING_CHUNK_DELAY = 0.02
+MAX_SIZE_PER_ITEM_IN_MEMORY_CACHE_IN_KB = 512
+DEFAULT_MAX_TOKENS_FOR_TRITON = 2000
#### Networking settings ####
request_timeout: float = 6000  # time in seconds
STREAM_SSE_DONE_STRING: str = "[DONE]"
### SPEND TRACKING ###
DEFAULT_REPLICATE_GPU_PRICE_PER_SECOND = 0.001400  # price per second for a100 80GB
+FIREWORKS_AI_56_B_MOE = 56
+FIREWORKS_AI_176_B_MOE = 176
+FIREWORKS_AI_16_B = 16
+FIREWORKS_AI_80_B = 80
LITELLM_CHAT_PROVIDERS = [
    "openai",

View file

@@ -16,6 +16,7 @@ import litellm.litellm_core_utils.litellm_logging
import litellm.types
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.caching.caching import DualCache
+from litellm.constants import HOURS_IN_A_DAY
from litellm.integrations.custom_batch_logger import CustomBatchLogger
from litellm.litellm_core_utils.duration_parser import duration_in_seconds
from litellm.litellm_core_utils.exception_mapping_utils import (
@@ -646,10 +647,10 @@ class SlackAlerting(CustomBatchLogger):
                event_message += (
                    f"Budget Crossed\n Total Budget:`{user_info.max_budget}`"
                )
-            elif percent_left <= 0.05:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT:
                event = "threshold_crossed"
                event_message += "5% Threshold Crossed "
-            elif percent_left <= 0.15:
+            elif percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT:
                event = "threshold_crossed"
                event_message += "15% Threshold Crossed"
            elif user_info.soft_budget is not None:
@@ -1715,7 +1716,7 @@ Model Info:
                await self.internal_usage_cache.async_set_cache(
                    key=_event_cache_key,
                    value="SENT",
-                    ttl=(30 * 24 * 60 * 60),  # 1 month
+                    ttl=(30 * HOURS_IN_A_DAY * 60 * 60),  # 1 month
                )
        except Exception as e:
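
(Sanity check, not part of the commit.) The substitutions above are value-preserving; the arithmetic below uses only numbers taken from the diff:

HOURS_IN_A_DAY = 24
SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15

# ttl=(30 * HOURS_IN_A_DAY * 60 * 60) is unchanged from 30 * 24 * 60 * 60
assert 30 * HOURS_IN_A_DAY * 60 * 60 == 2_592_000  # ~1 month in seconds

# the threshold comparisons behave exactly as before
percent_left = 0.04  # illustrative value
assert percent_left <= SLACK_ALERTING_THRESHOLD_5_PERCENT
assert percent_left <= SLACK_ALERTING_THRESHOLD_15_PERCENT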

View file

@@ -41,7 +41,7 @@ from litellm.types.utils import StandardLoggingPayload
from ..additional_logging_utils import AdditionalLoggingUtils
# max number of logs DD API can accept
-DD_MAX_BATCH_SIZE = 1000
# specify what ServiceTypes are logged as success events to DD. (We don't want to spam DD traces with large number of service types)
DD_LOGGED_SUCCESS_SERVICE_TYPES = [

View file

@@ -20,10 +20,6 @@ else:
    VertexBase = Any
-GCS_DEFAULT_BATCH_SIZE = 2048
-GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSBucketLogger(GCSBucketBase, AdditionalLoggingUtils):
    def __init__(self, bucket_name: Optional[str] = None) -> None:
        from litellm.proxy.proxy_server import premium_user

View file

@@ -3,6 +3,7 @@ from typing import Optional, Tuple
import httpx
import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
from litellm.secret_managers.main import get_secret, get_secret_str
from ..types.router import LiteLLM_Params
@@ -256,10 +257,13 @@ def get_llm_provider(  # noqa: PLR0915
        elif model in litellm.cohere_chat_models:
            custom_llm_provider = "cohere_chat"
        ## replicate
-        elif model in litellm.replicate_models or (":" in model and len(model) > 64):
+        elif model in litellm.replicate_models or (
+            ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
+        ):
            model_parts = model.split(":")
            if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
            ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                custom_llm_provider = "replicate"
        elif model in litellm.replicate_models:
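
(Illustration, not part of the commit.) How the named length check classifies a Replicate-style model string; the example string is the one already quoted in the hunk's own comment:

REPLICATE_MODEL_NAME_WITH_ID_LENGTH = 64

model = (
    "meta/llama-2-70b-chat:"
    "02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
)

looks_like_replicate = ":" in model and len(model) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH
version_id = model.split(":")[1]
assert looks_like_replicate
assert len(version_id) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH  # 64-char version hash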

View file

@@ -28,6 +28,10 @@ from litellm._logging import _is_debugging_on, verbose_logger
from litellm.batches.batch_utils import _handle_completed_batch
from litellm.caching.caching import DualCache, InMemoryCache
from litellm.caching.caching_handler import LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT,
+    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT,
+)
from litellm.cost_calculator import _select_model_name_for_cost_calc
from litellm.integrations.arize.arize import ArizeLogger
from litellm.integrations.custom_guardrail import CustomGuardrail
@@ -3743,9 +3747,12 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
        response_cost=response_cost,
        response_cost_failure_debug_info=None,
        status=str("success"),
-        total_tokens=int(30),
-        prompt_tokens=int(20),
-        completion_tokens=int(10),
+        total_tokens=int(
+            DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
+            + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
+        ),
+        prompt_tokens=int(DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT),
+        completion_tokens=int(DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT),
        startTime=start_time,
        endTime=end_time,
        completionStartTime=completion_start_time,
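
(Sanity check, not part of the commit.) The dummy payload's total stays at 30 tokens, although the prompt/completion split flips from 20/10 to 10/20 to match the constants:

DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT = 10
DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT = 20

total_tokens = (
    DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT
    + DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT
)
assert total_tokens == 30  # matches the old hardcoded total_tokens=int(30)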

View file

@@ -5,6 +5,7 @@ Helper utilities for tracking the cost of built-in tools.
from typing import Any, Dict, List, Optional
import litellm
+from litellm.constants import OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
from litellm.types.llms.openai import FileSearchTool, WebSearchOptions
from litellm.types.utils import (
    ModelInfo,
@@ -132,7 +133,7 @@ class StandardBuiltInToolCostTracking:
        """
        if file_search is None:
            return 0.0
-        return 2.5 / 1000
+        return OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
    @staticmethod
    def chat_completion_response_includes_annotations(
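
(Sanity check, not part of the commit.) OPENAI_FILE_SEARCH_COST_PER_1K_CALLS keeps the old value of 2.5 / 1000, i.e. $0.0025 per file_search call; the call count below is an invented example:

OPENAI_FILE_SEARCH_COST_PER_1K_CALLS = 2.5 / 1000  # $2.50 per 1,000 calls

assert OPENAI_FILE_SEARCH_COST_PER_1K_CALLS == 0.0025  # cost of a single call

n_calls = 400  # hypothetical number of file_search tool calls
cost = n_calls * OPENAI_FILE_SEARCH_COST_PER_1K_CALLS
assert round(cost, 6) == 1.0  # $1.00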

View file

@@ -11,6 +11,10 @@ from litellm.constants import (
    DEFAULT_IMAGE_HEIGHT,
    DEFAULT_IMAGE_TOKEN_COUNT,
    DEFAULT_IMAGE_WIDTH,
+    MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES,
+    MAX_TILE_HEIGHT,
+    MAX_TILE_WIDTH,
)
from litellm.llms.custom_httpx.http_handler import _get_httpx_client
@@ -97,11 +101,14 @@ def resize_image_high_res(
    height: int,
) -> Tuple[int, int]:
    # Maximum dimensions for high res mode
-    max_short_side = 768
-    max_long_side = 2000
+    max_short_side = MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    max_long_side = MAX_LONG_SIDE_FOR_IMAGE_HIGH_RES
    # Return early if no resizing is needed
-    if width <= 768 and height <= 768:
+    if (
+        width <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+        and height <= MAX_SHORT_SIDE_FOR_IMAGE_HIGH_RES
+    ):
        return width, height
    # Determine the longer and shorter sides
@@ -132,7 +139,10 @@ def resize_image_high_res(
# Test the function with the given example
def calculate_tiles_needed(
-    resized_width, resized_height, tile_width=512, tile_height=512
+    resized_width,
+    resized_height,
+    tile_width=MAX_TILE_WIDTH,
+    tile_height=MAX_TILE_HEIGHT,
):
    tiles_across = (resized_width + tile_width - 1) // tile_width
    tiles_down = (resized_height + tile_height - 1) // tile_height
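
(Illustration, not part of the commit.) calculate_tiles_needed is a ceiling division on each axis, so swapping the 512 literals for MAX_TILE_WIDTH/MAX_TILE_HEIGHT changes nothing numerically. A standalone worked example; only the two division lines mirror the hunk, the return value and test numbers are assumptions:

MAX_TILE_WIDTH = 512
MAX_TILE_HEIGHT = 512

def calculate_tiles_needed(
    resized_width,
    resized_height,
    tile_width=MAX_TILE_WIDTH,
    tile_height=MAX_TILE_HEIGHT,
):
    # ceiling division: how many tiles are needed to cover each axis
    tiles_across = (resized_width + tile_width - 1) // tile_width
    tiles_down = (resized_height + tile_height - 1) // tile_height
    return tiles_across * tiles_down  # total tile count (assumed return value)

assert calculate_tiles_needed(1024, 768) == 2 * 2  # 768 px needs two 512 px rows
assert calculate_tiles_needed(512, 512) == 1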

View file

@@ -5,7 +5,10 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union, cast
import httpx
import litellm
-from litellm.constants import RESPONSE_FORMAT_TOOL_NAME
+from litellm.constants import (
+    DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,
+    RESPONSE_FORMAT_TOOL_NAME,
+)
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.prompt_templates.factory import anthropic_messages_pt
from litellm.llms.base_llm.base_utils import type_to_response_format_param
@@ -50,7 +53,7 @@ class AnthropicConfig(BaseConfig):
    """
    max_tokens: Optional[int] = (
-        4096  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
+        DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS  # anthropic requires a default value (Opus, Sonnet, and Haiku have the same default)
    )
    stop_sequences: Optional[list] = None
    temperature: Optional[int] = None
@@ -63,7 +66,7 @@ class AnthropicConfig(BaseConfig):
        self,
        max_tokens: Optional[
            int
-        ] = 4096,  # You can pass in a value yourself or use the default value 4096
+        ] = DEFAULT_ANTHROPIC_CHAT_MAX_TOKENS,  # You can pass in a value yourself or use the default value 4096
        stop_sequences: Optional[list] = None,
        temperature: Optional[int] = None,
        top_p: Optional[int] = None,

View file

@@ -11,6 +11,7 @@ from typing import AsyncIterator, Dict, Iterator, List, Optional, Union
import httpx
import litellm
+from litellm.constants import DEFAULT_MAX_TOKENS
from litellm.litellm_core_utils.prompt_templates.factory import (
    custom_prompt,
    prompt_factory,
@@ -65,7 +66,9 @@ class AnthropicTextConfig(BaseConfig):
    def __init__(
        self,
-        max_tokens_to_sample: Optional[int] = 256,  # anthropic requires a default
+        max_tokens_to_sample: Optional[
+            int
+        ] = DEFAULT_MAX_TOKENS,  # anthropic requires a default
        stop_sequences: Optional[list] = None,
        temperature: Optional[int] = None,
        top_p: Optional[int] = None,

View file

@@ -7,7 +7,7 @@ import httpx  # type: ignore
from openai import APITimeoutError, AsyncAzureOpenAI, AzureOpenAI
import litellm
-from litellm.constants import DEFAULT_MAX_RETRIES
+from litellm.constants import AZURE_OPERATION_POLLING_TIMEOUT, DEFAULT_MAX_RETRIES
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
from litellm.llms.custom_httpx.http_handler import (
@@ -859,7 +859,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
        await response.aread()
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
        start_time = time.time()
        if "status" not in response.json():
            raise Exception(
@@ -959,7 +959,7 @@ class AzureChatCompletion(BaseAzureLLM, BaseLLM):
        response.read()
-        timeout_secs: int = 120
+        timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
        start_time = time.time()
        if "status" not in response.json():
            raise Exception(
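
(Illustration, not part of the commit.) timeout_secs drives a poll-until-done loop; below is a minimal sketch of that pattern using the new 120-second constant. The check_status callable, the poll interval, and the status strings are assumptions, not the file's actual code:

import time

AZURE_OPERATION_POLLING_TIMEOUT = 120  # seconds, from litellm/constants.py

def wait_for_operation(check_status, poll_interval: float = 1.0):
    """Poll until the operation reports a terminal status or the deadline passes."""
    timeout_secs: int = AZURE_OPERATION_POLLING_TIMEOUT
    start_time = time.time()
    while time.time() - start_time < timeout_secs:
        status = check_status()  # hypothetical callable returning a status string
        if status in ("succeeded", "failed"):
            return status
        time.sleep(poll_interval)
    raise TimeoutError(f"operation still pending after {timeout_secs}s")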

View file

@@ -7,6 +7,10 @@ from litellm.litellm_core_utils.prompt_templates.factory import (
    convert_to_azure_openai_messages,
)
from litellm.llms.base_llm.chat.transformation import BaseLLMException
+from litellm.types.llms.azure import (
+    API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT,
+    API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT,
+)
from litellm.types.utils import ModelResponse
from litellm.utils import supports_response_schema
@@ -123,7 +127,10 @@ class AzureOpenAIConfig(BaseConfig):
        - check if api_version is supported for response_format
        """
-        is_supported = int(api_version_year) <= 2024 and int(api_version_month) >= 8
+        is_supported = (
+            int(api_version_year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
+            and int(api_version_month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
+        )
        return is_supported
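
(Illustration, not part of the commit.) A worked example of the api_version gate above; the comparison mirrors the hunk, while the helper name and the version-string parsing are assumptions:

API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

def supports_response_format(api_version: str) -> bool:
    # e.g. "2024-08-01-preview" -> year 2024, month 8 (parsing is illustrative)
    year, month = api_version.split("-")[:2]
    return (
        int(year) <= API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT
        and int(month) >= API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT
    )

assert supports_response_format("2024-08-01-preview") is True
assert supports_response_format("2024-06-01") is False  # month 6 < 8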

View file

@@ -9,7 +9,7 @@ from pydantic import BaseModel
from litellm._logging import verbose_logger
from litellm.caching.caching import DualCache
-from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL
+from litellm.constants import BEDROCK_INVOKE_PROVIDERS_LITERAL, BEDROCK_MAX_POLICY_SIZE
from litellm.litellm_core_utils.dd_tracing import tracer
from litellm.secret_managers.main import get_secret
@@ -381,7 +381,7 @@ class BaseAWSLLM:
            "region_name": aws_region_name,
        }
-        if sts_response["PackedPolicySize"] > 75:
+        if sts_response["PackedPolicySize"] > BEDROCK_MAX_POLICY_SIZE:
            verbose_logger.warning(
                f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
            )

View file

@@ -1274,13 +1274,6 @@
    def converse_chunk_parser(self, chunk_data: dict) -> ModelResponseStream:
        try:
            verbose_logger.debug("\n\nRaw Chunk: {}\n\n".format(chunk_data))
-            chunk_data["usage"] = {
-                "inputTokens": 3,
-                "outputTokens": 392,
-                "totalTokens": 2191,
-                "cacheReadInputTokens": 1796,
-                "cacheWriteInputTokens": 0,
-            }
            text = ""
            tool_use: Optional[ChatCompletionToolCallChunk] = None
            finish_reason = ""

View file

@@ -1,6 +1,7 @@
from typing import Optional, Tuple, Union
import litellm
+from litellm.constants import MIN_NON_ZERO_TEMPERATURE
from litellm.llms.openai.chat.gpt_transformation import OpenAIGPTConfig
from litellm.secret_managers.main import get_secret_str
@@ -84,7 +85,7 @@ class DeepInfraConfig(OpenAIGPTConfig):
                and value == 0
                and model == "mistralai/Mistral-7B-Instruct-v0.1"
            ):  # this model does no support temperature == 0
-                value = 0.0001  # close to 0
+                value = MIN_NON_ZERO_TEMPERATURE  # close to 0
            if param == "tool_choice":
                if (
                    value != "auto" and value != "none"

View file

@@ -4,6 +4,12 @@ For calculating cost of fireworks ai serverless inference models.
from typing import Tuple
+from litellm.constants import (
+    FIREWORKS_AI_16_B,
+    FIREWORKS_AI_56_B_MOE,
+    FIREWORKS_AI_80_B,
+    FIREWORKS_AI_176_B_MOE,
+)
from litellm.types.utils import Usage
from litellm.utils import get_model_info
@@ -25,9 +31,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
    moe_match = re.search(r"(\d+)x(\d+)b", model_name)
    if moe_match:
        total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
-        if total_billion <= 56:
+        if total_billion <= FIREWORKS_AI_56_B_MOE:
            return "fireworks-ai-moe-up-to-56b"
-        elif total_billion <= 176:
+        elif total_billion <= FIREWORKS_AI_176_B_MOE:
            return "fireworks-ai-56b-to-176b"
    # Check for standard models in the form <number>b
@@ -37,9 +43,9 @@ def get_base_model_for_pricing(model_name: str) -> str:
        params_billion = float(params_match)
        # Determine the category based on the number of parameters
-        if params_billion <= 16.0:
+        if params_billion <= FIREWORKS_AI_16_B:
            return "fireworks-ai-up-to-16b"
-        elif params_billion <= 80.0:
+        elif params_billion <= FIREWORKS_AI_80_B:
            return "fireworks-ai-16b-80b"
    # If no matches, return the original model_name
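
(Illustration, not part of the commit.) How the MoE and dense buckets fall out with the named constants. The comparisons and the MoE regex mirror the hunk; the dense-size regex, the fallthrough handling, and the model names are assumptions:

import re

FIREWORKS_AI_56_B_MOE = 56
FIREWORKS_AI_176_B_MOE = 176
FIREWORKS_AI_16_B = 16
FIREWORKS_AI_80_B = 80

def bucket(model_name: str) -> str:
    # mixture-of-experts models named <experts>x<size>b
    moe_match = re.search(r"(\d+)x(\d+)b", model_name)
    if moe_match:
        total_billion = int(moe_match.group(1)) * int(moe_match.group(2))
        if total_billion <= FIREWORKS_AI_56_B_MOE:
            return "fireworks-ai-moe-up-to-56b"
        elif total_billion <= FIREWORKS_AI_176_B_MOE:
            return "fireworks-ai-56b-to-176b"
    # dense models named <size>b (regex here is an assumption)
    params_match = re.search(r"(\d+)b", model_name)
    if params_match:
        params_billion = float(params_match.group(1))
        if params_billion <= FIREWORKS_AI_16_B:
            return "fireworks-ai-up-to-16b"
        elif params_billion <= FIREWORKS_AI_80_B:
            return "fireworks-ai-16b-80b"
    return model_name

assert bucket("mixtral-8x7b-instruct") == "fireworks-ai-moe-up-to-56b"  # 8*7 = 56
assert bucket("mixtral-8x22b-instruct") == "fireworks-ai-56b-to-176b"   # 8*22 = 176
assert bucket("llama-v3p1-70b-instruct") == "fireworks-ai-16b-80b"      # 70 <= 80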

View file

@@ -2,6 +2,7 @@ from typing import TYPE_CHECKING, Any, List, Literal, Optional, Union
from httpx import Headers, Response
+from litellm.constants import DEFAULT_MAX_TOKENS
from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException
from litellm.types.llms.openai import AllMessageValues
from litellm.types.utils import ModelResponse
@@ -27,7 +28,7 @@ class PredibaseConfig(BaseConfig):
    decoder_input_details: Optional[bool] = None
    details: bool = True  # enables returning logprobs + best of
    max_new_tokens: int = (
-        256  # openai default - requests hang if max_new_tokens not given
+        DEFAULT_MAX_TOKENS  # openai default - requests hang if max_new_tokens not given
    )
    repetition_penalty: Optional[float] = None
    return_full_text: Optional[bool] = (

View file

@@ -4,6 +4,7 @@ import time
from typing import Callable, List, Union
import litellm
+from litellm.constants import REPLICATE_POLLING_DELAY_SECONDS
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
@@ -28,7 +29,9 @@ def handle_prediction_response_streaming(
    status = ""
    while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.5)  # prevent being rate limited by replicate
+        time.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
        print_verbose(f"replicate: polling endpoint: {prediction_url}")
        response = http_client.get(prediction_url, headers=headers)
        if response.status_code == 200:
@@ -77,7 +80,9 @@ async def async_handle_prediction_response_streaming(
    status = ""
    while True and (status not in ["succeeded", "failed", "canceled"]):
-        await asyncio.sleep(0.5)  # prevent being rate limited by replicate
+        await asyncio.sleep(
+            REPLICATE_POLLING_DELAY_SECONDS
+        )  # prevent being rate limited by replicate
        print_verbose(f"replicate: polling endpoint: {prediction_url}")
        response = await http_client.get(prediction_url, headers=headers)
        if response.status_code == 200:

View file

@@ -3,6 +3,7 @@ from typing import TYPE_CHECKING, Any, List, Optional, Union
import httpx
import litellm
+from litellm.constants import REPLICATE_MODEL_NAME_WITH_ID_LENGTH
from litellm.litellm_core_utils.prompt_templates.common_utils import (
    convert_content_list_to_str,
)
@@ -220,10 +221,11 @@ class ReplicateConfig(BaseConfig):
        version_id = self.model_to_version_id(model)
        request_data: dict = {"input": input_data}
-        if ":" in version_id and len(version_id) > 64:
+        if ":" in version_id and len(version_id) > REPLICATE_MODEL_NAME_WITH_ID_LENGTH:
            model_parts = version_id.split(":")
            if (
-                len(model_parts) > 1 and len(model_parts[1]) == 64
+                len(model_parts) > 1
+                and len(model_parts[1]) == REPLICATE_MODEL_NAME_WITH_ID_LENGTH
            ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
                request_data["version"] = model_parts[1]

View file

@@ -4,6 +4,16 @@ Handles calculating cost for together ai models
import re
+from litellm.constants import (
+    TOGETHER_AI_4_B,
+    TOGETHER_AI_8_B,
+    TOGETHER_AI_21_B,
+    TOGETHER_AI_41_B,
+    TOGETHER_AI_80_B,
+    TOGETHER_AI_110_B,
+    TOGETHER_AI_EMBEDDING_150_M,
+    TOGETHER_AI_EMBEDDING_350_M,
+)
from litellm.types.utils import CallTypes
@@ -31,17 +41,17 @@ def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
        else:
            return model_name
    # Determine the category based on the number of parameters
-    if params_billion <= 4.0:
+    if params_billion <= TOGETHER_AI_4_B:
        category = "together-ai-up-to-4b"
-    elif params_billion <= 8.0:
+    elif params_billion <= TOGETHER_AI_8_B:
        category = "together-ai-4.1b-8b"
-    elif params_billion <= 21.0:
+    elif params_billion <= TOGETHER_AI_21_B:
        category = "together-ai-8.1b-21b"
-    elif params_billion <= 41.0:
+    elif params_billion <= TOGETHER_AI_41_B:
        category = "together-ai-21.1b-41b"
-    elif params_billion <= 80.0:
+    elif params_billion <= TOGETHER_AI_80_B:
        category = "together-ai-41.1b-80b"
-    elif params_billion <= 110.0:
+    elif params_billion <= TOGETHER_AI_110_B:
        category = "together-ai-81.1b-110b"
    if category is not None:
        return category
@@ -69,9 +79,9 @@ def get_model_params_and_category_embeddings(model_name) -> str:
        else:
            return model_name
    # Determine the category based on the number of parameters
-    if params_million <= 150:
+    if params_million <= TOGETHER_AI_EMBEDDING_150_M:
        category = "together-ai-embedding-up-to-150m"
-    elif params_million <= 350:
+    elif params_million <= TOGETHER_AI_EMBEDDING_350_M:
        category = "together-ai-embedding-151m-to-350m"
    if category is not None:
        return category
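
(Illustration, not part of the commit.) Same idea for Together AI: only the thresholds get names. Assuming the parameter count has already been parsed from the model name upstream (that part is not shown in the hunk), the chat bucketing reads:

TOGETHER_AI_4_B = 4
TOGETHER_AI_8_B = 8
TOGETHER_AI_21_B = 21
TOGETHER_AI_41_B = 41
TOGETHER_AI_80_B = 80
TOGETHER_AI_110_B = 110

def together_chat_bucket(params_billion: float) -> str:
    # thresholds mirror the hunk; the fallback value is an assumption
    if params_billion <= TOGETHER_AI_4_B:
        return "together-ai-up-to-4b"
    elif params_billion <= TOGETHER_AI_8_B:
        return "together-ai-4.1b-8b"
    elif params_billion <= TOGETHER_AI_21_B:
        return "together-ai-8.1b-21b"
    elif params_billion <= TOGETHER_AI_41_B:
        return "together-ai-21.1b-41b"
    elif params_billion <= TOGETHER_AI_80_B:
        return "together-ai-41.1b-80b"
    elif params_billion <= TOGETHER_AI_110_B:
        return "together-ai-81.1b-110b"
    return "unknown"

assert together_chat_bucket(70) == "together-ai-41.1b-80b"
assert together_chat_bucket(7) == "together-ai-4.1b-8b"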

View file

@@ -7,6 +7,7 @@ from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Optional,
from httpx import Headers, Response
+from litellm.constants import DEFAULT_MAX_TOKENS_FOR_TRITON
from litellm.litellm_core_utils.prompt_templates.factory import prompt_factory
from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator
from litellm.llms.base_llm.chat.transformation import (
@@ -195,7 +196,9 @@ class TritonGenerateConfig(TritonConfig):
        data_for_triton: Dict[str, Any] = {
            "text_input": prompt_factory(model=model, messages=messages),
            "parameters": {
-                "max_tokens": int(optional_params.get("max_tokens", 2000)),
+                "max_tokens": int(
+                    optional_params.get("max_tokens", DEFAULT_MAX_TOKENS_FOR_TRITON)
+                ),
                "bad_words": [""],
                "stop_words": [""],
            },

View file

@@ -1,6 +1,7 @@
from typing import TYPE_CHECKING, Any, Optional
from litellm._logging import verbose_router_logger
+from litellm.constants import MAX_EXCEPTION_MESSAGE_LENGTH
from litellm.router_utils.cooldown_handlers import (
    _async_get_cooldown_deployments_with_debug_info,
)
@@ -54,7 +55,7 @@ async def send_llm_exception_alert(
    exception_str = str(original_exception)
    if litellm_debug_info is not None:
        exception_str += litellm_debug_info
-    exception_str += f"\n\n{error_traceback_str[:2000]}"
+    exception_str += f"\n\n{error_traceback_str[:MAX_EXCEPTION_MESSAGE_LENGTH]}"
    await litellm_router_instance.slack_alerting_logger.send_alert(
        message=f"LLM API call failed: `{exception_str}`",

View file

@@ -1,6 +1,8 @@
from enum import Enum
from typing import Optional, TypedDict
+DD_MAX_BATCH_SIZE = 1000
class DataDogStatus(str, Enum):
    INFO = "info"

View file

@@ -8,6 +8,10 @@ else:
    VertexBase = Any
+GCS_DEFAULT_BATCH_SIZE = 2048
+GCS_DEFAULT_FLUSH_INTERVAL_SECONDS = 20
class GCSLoggingConfig(TypedDict):
    """
    Internal LiteLLM Config for GCS Bucket logging

View file

@@ -7,6 +7,9 @@ from pydantic import BaseModel, Field
from litellm.types.utils import LiteLLMPydanticObjectBase
+SLACK_ALERTING_THRESHOLD_5_PERCENT = 0.05
+SLACK_ALERTING_THRESHOLD_15_PERCENT = 0.15
class BaseOutageModel(TypedDict):
    alerts: List[int]

View file

@@ -0,0 +1,2 @@
+API_VERSION_YEAR_SUPPORTED_RESPONSE_FORMAT = 2024
+API_VERSION_MONTH_SUPPORTED_RESPONSE_FORMAT = 8

View file

@@ -0,0 +1 @@

View file

@@ -35,6 +35,16 @@ ALLOWED_NUMBERS = {
    25,
    10000,
    60000,
+    8,
+    2048,
+    16000000000,
+    16,
+    16383,
+    14,
+    24,
+    128000,
+    0.01,
+    20,
}
# Add all standard HTTP status codes
@@ -55,16 +65,23 @@ HTTP_STATUS_CODES = {
    402,  # Payment Required
    403,  # Forbidden
    404,  # Not Found
+    406,  # Not Acceptable
    408,  # Request Timeout
    409,  # Conflict
+    413,  # Payload Too Large
    422,  # Unprocessable Entity
+    424,  # Failed Dependency
    429,  # Too Many Requests
+    498,  # Invalid Token
    499,  # Client Closed Request
    500,  # Internal Server Error
    501,  # Not Implemented
    502,  # Bad Gateway
    503,  # Service Unavailable
    504,  # Gateway Timeout
+    520,  # Web server is returning an unknown error
+    522,  # Connection timed out
+    524,  # A timeout occurred
    529,  # Site is overloaded
}
@@ -112,7 +129,13 @@ def check_file(filename):
def main():
    exit_code = 0
    folder = "../../litellm"
-    ignore_files = ["constants.py", "proxy_cli.py"]
+    ignore_files = [
+        "constants.py",
+        "proxy_cli.py",
+        "token_counter.py",
+        "mock_functions.py",
+        "duration_parser.py",
+    ]
    ignore_folder = "types"
    for root, dirs, files in os.walk(folder):
        for filename in files:
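
(Illustration, not part of the commit.) The deleted report in the next file is the output of this checker. A minimal sketch of how an AST-based guard like this can work; the ALLOWED_NUMBERS idea mirrors the hunk, but the function below is made up and far simpler than the repo's script:

import ast

ALLOWED_NUMBERS = {0, 1, 2, 10, 100, 404, 500}  # the real allow-list is much larger

def find_hardcoded_numbers(source: str):
    """Return (lineno, value) for numeric literals not in the allow-list."""
    hits = []
    for node in ast.walk(ast.parse(source)):
        if (
            isinstance(node, ast.Constant)
            and isinstance(node.value, (int, float))
            and not isinstance(node.value, bool)  # skip True/False
            and node.value not in ALLOWED_NUMBERS
        ):
            hits.append((node.lineno, node.value))
    return hits

print(find_hardcoded_numbers("timeout_secs = 120\nretries = 2\n"))
# -> [(1, 120)]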

View file

@@ -1,139 +0,0 @@
ERROR in ../../litellm/integrations/weights_biases.py: Hardcoded numbers detected:
Line 10: 8
ERROR in ../../litellm/integrations/gcs_bucket/gcs_bucket.py: Hardcoded numbers detected:
Line 23: 2048
Line 24: 20
ERROR in ../../litellm/integrations/datadog/datadog.py: Hardcoded numbers detected:
Line 165: 413
ERROR in ../../litellm/integrations/SlackAlerting/slack_alerting.py: Hardcoded numbers detected:
Line 649: 0.05
Line 652: 0.15
Line 1718: 24
ERROR in ../../litellm/integrations/opik/utils.py: Hardcoded numbers detected:
Line 14: 16000000000
Line 16: 16
Line 29: 16383
Line 33: 14
ERROR in ../../litellm/litellm_core_utils/token_counter.py: Hardcoded numbers detected:
Line 100: 768
Line 101: 2000
Line 104: 768
Line 104: 768
Line 135: 512
Line 135: 512
Line 148: 8
Line 157: 8
Line 160: 8
Line 192: 16
Line 192: 24
Line 202: 192
Line 202: 207
Line 202: 196
Line 205: 255
Line 215: 16
Line 216: 24
Line 216: 27
Line 217: 27
Line 220: 16
Line 221: 26
Line 221: 28
Line 221: 16383
Line 222: 28
Line 222: 16383
Line 225: 16
Line 226: 21
Line 227: 16383
Line 228: 14
Line 228: 16383
Line 238: 85
ERROR in ../../litellm/litellm_core_utils/litellm_logging.py: Hardcoded numbers detected:
Line 3681: 16
Line 3747: 20
ERROR in ../../litellm/litellm_core_utils/mock_functions.py: Hardcoded numbers detected:
Line 14: 1536
ERROR in ../../litellm/litellm_core_utils/duration_parser.py: Hardcoded numbers detected:
Line 30: 31
Line 59: 86400
Line 61: 604800
ERROR in ../../litellm/litellm_core_utils/get_llm_provider_logic.py: Hardcoded numbers detected:
Line 259: 64
Line 262: 64
ERROR in ../../litellm/litellm_core_utils/exception_mapping_utils.py: Hardcoded numbers detected:
Line 527: 413
Line 617: 413
Line 688: 14
Line 772: 424
Line 1058: 424
Line 1386: 498
Line 1612: 406
Line 1613: 413
Line 1635: 522
Line 1636: 524
Line 1669: 520
Line 1780: 524
ERROR in ../../litellm/litellm_core_utils/llm_cost_calc/tool_call_cost_tracking.py: Hardcoded numbers detected:
Line 135: 2.5
ERROR in ../../litellm/litellm_core_utils/llm_cost_calc/utils.py: Hardcoded numbers detected:
Line 13: 128000
ERROR in ../../litellm/router_utils/handle_error.py: Hardcoded numbers detected:
Line 57: 2000
ERROR in ../../litellm/llms/azure/azure.py: Hardcoded numbers detected:
Line 862: 120
Line 962: 120
ERROR in ../../litellm/llms/azure/common_utils.py: Hardcoded numbers detected:
Line 353: 8
ERROR in ../../litellm/llms/azure/chat/gpt_transformation.py: Hardcoded numbers detected:
Line 126: 2024
Line 126: 8
ERROR in ../../litellm/llms/predibase/chat/transformation.py: Hardcoded numbers detected:
Line 30: 256
Line 96: 0.01
ERROR in ../../litellm/llms/deepinfra/chat/transformation.py: Hardcoded numbers detected:
Line 87: 0.0001
ERROR in ../../litellm/llms/triton/completion/transformation.py: Hardcoded numbers detected:
Line 198: 2000
Line 274: 20
ERROR in ../../litellm/llms/bedrock/base_aws_llm.py: Hardcoded numbers detected:
Line 384: 75
ERROR in ../../litellm/llms/bedrock/chat/invoke_handler.py: Hardcoded numbers detected:
Line 1279: 392
Line 1280: 2191
Line 1281: 1796
ERROR in ../../litellm/llms/fireworks_ai/cost_calculator.py: Hardcoded numbers detected:
Line 28: 56
Line 30: 176
Line 40: 16.0
Line 42: 80.0
ERROR in ../../litellm/llms/replicate/chat/transformation.py: Hardcoded numbers detected:
Line 223: 64
Line 226: 64
ERROR in ../../litellm/llms/replicate/chat/handler.py: Hardcoded numbers detected:
Line 31: 0.5
Line 80: 0.5
ERROR in ../../litellm/llms/anthropic/chat/transformation.py: Hardcoded numbers detected:
Line 53: 4096
Line 66: 4096
ERROR in ../../litellm/llms/anthropic/completion/transformation.py: Hardcoded numbers detected:
Line 68: 256
ERROR in ../../litellm/llms/huggingface/chat/transformation.py: Hardcoded numbers detected:
Line 117: 0.01
ERROR in ../../litellm/llms/together_ai/cost_calculator.py: Hardcoded numbers detected:
Line 36: 8.0
Line 38: 21.0
Line 40: 41.0
Line 42: 80.0
Line 44: 110.0
Line 72: 150
Line 74: 350
ERROR in ../../litellm/llms/openai/openai.py: Hardcoded numbers detected:
Line 2018: 20
Line 2083: 20
ERROR in ../../litellm/llms/sagemaker/completion/transformation.py: Hardcoded numbers detected:
Line 84: 0.01
ERROR in ../../litellm/caching/qdrant_semantic_cache.py: Hardcoded numbers detected:
Line 121: 0.99
Line 135: 1536
ERROR in ../../litellm/caching/caching.py: Hardcoded numbers detected:
Line 409: 0.02
ERROR in ../../litellm/caching/in_memory_cache.py: Hardcoded numbers detected:
Line 55: 512