mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-25 18:54:30 +00:00

Merge branch 'BerriAI:main' into fix/groq-custom-pricing-cost

This commit is contained in commit fcd2586909

352 changed files with 14316 additions and 6075 deletions
@@ -57,9 +57,21 @@ import litellm._service_logger  # for storing API inputs, outputs, and metadata
 import litellm.litellm_core_utils
 import litellm.litellm_core_utils.audio_utils.utils
 import litellm.litellm_core_utils.json_validation_rule
+import litellm.llms
+import litellm.llms.gemini
 from litellm.caching._internal_lru_cache import lru_cache_wrapper
 from litellm.caching.caching import DualCache
 from litellm.caching.caching_handler import CachingHandlerResponse, LLMCachingHandler
+from litellm.constants import (
+    DEFAULT_MAX_LRU_CACHE_SIZE,
+    DEFAULT_TRIM_RATIO,
+    FUNCTION_DEFINITION_TOKEN_COUNT,
+    INITIAL_RETRY_DELAY,
+    JITTER,
+    MAX_RETRY_DELAY,
+    MINIMUM_PROMPT_CACHE_TOKEN_COUNT,
+    TOOL_CHOICE_OBJECT_TOKEN_COUNT,
+)
 from litellm.integrations.custom_guardrail import CustomGuardrail
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.core_helpers import (
@@ -207,6 +219,7 @@ from litellm.llms.base_llm.base_utils import (
 from litellm.llms.base_llm.chat.transformation import BaseConfig
 from litellm.llms.base_llm.completion.transformation import BaseTextCompletionConfig
 from litellm.llms.base_llm.embedding.transformation import BaseEmbeddingConfig
+from litellm.llms.base_llm.files.transformation import BaseFilesConfig
 from litellm.llms.base_llm.image_variations.transformation import (
     BaseImageVariationConfig,
 )
@@ -1259,6 +1272,7 @@ def client(original_function):  # noqa: PLR0915
             logging_obj, kwargs = function_setup(
                 original_function.__name__, rules_obj, start_time, *args, **kwargs
             )
+
             kwargs["litellm_logging_obj"] = logging_obj
             ## LOAD CREDENTIALS
             load_credentials_from_list(kwargs)
@@ -1516,7 +1530,7 @@ def _select_tokenizer(
     return _select_tokenizer_helper(model=model)


-@lru_cache(maxsize=128)
+@lru_cache(maxsize=DEFAULT_MAX_LRU_CACHE_SIZE)
 def _select_tokenizer_helper(model: str) -> SelectTokenizerResponse:
     if litellm.disable_hf_tokenizer_download is True:
         return _return_openai_tokenizer(model)
@@ -2624,7 +2638,7 @@ def get_optional_params_embeddings(  # noqa: PLR0915
             non_default_params=non_default_params, optional_params={}, kwargs=kwargs
         )
         return optional_params
-    elif custom_llm_provider == "vertex_ai":
+    elif custom_llm_provider == "vertex_ai" or custom_llm_provider == "gemini":
         supported_params = get_supported_openai_params(
             model=model,
             custom_llm_provider="vertex_ai",
@@ -2839,6 +2853,7 @@ def get_optional_params(  # noqa: PLR0915
     api_version=None,
     parallel_tool_calls=None,
     drop_params=None,
+    allowed_openai_params: Optional[List[str]] = None,
     reasoning_effort=None,
     additional_drop_params=None,
     messages: Optional[List[AllMessageValues]] = None,
@@ -2924,6 +2939,7 @@ def get_optional_params(  # noqa: PLR0915
         "api_version": None,
         "parallel_tool_calls": None,
         "drop_params": None,
+        "allowed_openai_params": None,
         "additional_drop_params": None,
         "messages": None,
         "reasoning_effort": None,
@@ -2940,6 +2956,7 @@ def get_optional_params(  # noqa: PLR0915
         and k != "custom_llm_provider"
         and k != "api_version"
         and k != "drop_params"
+        and k != "allowed_openai_params"
         and k != "additional_drop_params"
         and k != "messages"
         and k in default_params
@@ -3049,6 +3066,12 @@ def get_optional_params(  # noqa: PLR0915
                 tool_function["parameters"] = new_parameters

     def _check_valid_arg(supported_params: List[str]):
+        """
+        Check if the params passed to completion() are supported by the provider
+
+        Args:
+            supported_params: List[str] - supported params from the litellm config
+        """
         verbose_logger.info(
             f"\nLiteLLM completion() model= {model}; provider = {custom_llm_provider}"
         )
@@ -3082,7 +3105,7 @@ def get_optional_params(  # noqa: PLR0915
             else:
                 raise UnsupportedParamsError(
                     status_code=500,
-                    message=f"{custom_llm_provider} does not support parameters: {unsupported_params}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n",
+                    message=f"{custom_llm_provider} does not support parameters: {list(unsupported_params.keys())}, for model={model}. To drop these, set `litellm.drop_params=True` or for proxy:\n\n`litellm_settings:\n drop_params: true`\n. \n If you want to use these params dynamically send allowed_openai_params={list(unsupported_params.keys())} in your request.",
                 )

     supported_params = get_supported_openai_params(
@@ -3092,7 +3115,14 @@ def get_optional_params(  # noqa: PLR0915
         supported_params = get_supported_openai_params(
             model=model, custom_llm_provider="openai"
         )
-    _check_valid_arg(supported_params=supported_params or [])
+
+    supported_params = supported_params or []
+    allowed_openai_params = allowed_openai_params or []
+    supported_params.extend(allowed_openai_params)
+
+    _check_valid_arg(
+        supported_params=supported_params or [],
+    )
     ## raise exception if provider doesn't support passed in param
     if custom_llm_provider == "anthropic":
         ## check if unsupported param passed in
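
Taken together with the new error message above, this lets a caller opt specific OpenAI params back in per request: anything listed in allowed_openai_params is appended to the provider's supported params before validation and then forwarded as-is. A minimal, hedged usage sketch (the model name and the extra param are illustrative assumptions, not taken from this diff):

import litellm

response = litellm.completion(
    model="vertex_ai/gemini-1.5-pro",  # illustrative model
    messages=[{"role": "user", "content": "hello"}],
    # Opt in to forwarding this param as-is, even if the provider mapping does not list it.
    allowed_openai_params=["parallel_tool_calls"],
    parallel_tool_calls=True,
)
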
@@ -3195,7 +3225,7 @@ def get_optional_params(  # noqa: PLR0915
             ),
         )
     elif custom_llm_provider == "huggingface":
-        optional_params = litellm.HuggingfaceConfig().map_openai_params(
+        optional_params = litellm.HuggingFaceChatConfig().map_openai_params(
             non_default_params=non_default_params,
             optional_params=optional_params,
             model=model,
@@ -3731,6 +3761,26 @@ def get_optional_params(  # noqa: PLR0915
         if k not in default_params.keys():
             optional_params[k] = passed_params[k]
     print_verbose(f"Final returned optional params: {optional_params}")
+    optional_params = _apply_openai_param_overrides(
+        optional_params=optional_params,
+        non_default_params=non_default_params,
+        allowed_openai_params=allowed_openai_params,
+    )
     return optional_params


+def _apply_openai_param_overrides(
+    optional_params: dict, non_default_params: dict, allowed_openai_params: list
+):
+    """
+    If user passes in allowed_openai_params, apply them to optional_params
+
+    These params will get passed as is to the LLM API since the user opted in to passing them in the request
+    """
+    if allowed_openai_params:
+        for param in allowed_openai_params:
+            if param not in optional_params:
+                optional_params[param] = non_default_params.pop(param, None)
+    return optional_params
+
+
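
The override helper is small enough to exercise standalone. A self-contained sketch that mirrors the function body added above, with an illustrative call showing the pass-through behavior:

def _apply_openai_param_overrides(
    optional_params: dict, non_default_params: dict, allowed_openai_params: list
):
    # Params the user explicitly allowed are copied over verbatim if not already mapped.
    if allowed_openai_params:
        for param in allowed_openai_params:
            if param not in optional_params:
                optional_params[param] = non_default_params.pop(param, None)
    return optional_params


print(
    _apply_openai_param_overrides(
        optional_params={"temperature": 0.2},
        non_default_params={"reasoning_effort": "high"},
        allowed_openai_params=["reasoning_effort"],
    )
)
# -> {'temperature': 0.2, 'reasoning_effort': 'high'}
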
@@ -5296,15 +5346,15 @@ def _calculate_retry_after(
     if retry_after is not None and 0 < retry_after <= 60:
         return retry_after

-    initial_retry_delay = 0.5
-    max_retry_delay = 8.0
+    initial_retry_delay = INITIAL_RETRY_DELAY
+    max_retry_delay = MAX_RETRY_DELAY
     nb_retries = max_retries - remaining_retries

     # Apply exponential backoff, but not more than the max.
     sleep_seconds = min(initial_retry_delay * pow(2.0, nb_retries), max_retry_delay)

     # Apply some jitter, plus-or-minus half a second.
-    jitter = 1 - 0.25 * random.random()
+    jitter = JITTER * random.random()
     timeout = sleep_seconds * jitter
     return timeout if timeout >= min_timeout else min_timeout

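
The backoff math is unchanged apart from reading its bounds and jitter factor from litellm.constants. A self-contained sketch of the same computation; the constant values here are placeholders (the first two reuse the previously hard-coded 0.5 and 8.0, the JITTER value is an assumption), the real values live in litellm.constants:

import random

INITIAL_RETRY_DELAY = 0.5  # placeholder, the previously hard-coded value
MAX_RETRY_DELAY = 8.0      # placeholder, the previously hard-coded value
JITTER = 0.75              # assumed value, for illustration only


def sketch_retry_delay(max_retries: int, remaining_retries: int, min_timeout: float = 0.0) -> float:
    nb_retries = max_retries - remaining_retries
    # Exponential backoff, capped at the maximum delay.
    sleep_seconds = min(INITIAL_RETRY_DELAY * pow(2.0, nb_retries), MAX_RETRY_DELAY)
    # Scale by a random jitter factor so concurrent clients don't retry in lockstep.
    timeout = sleep_seconds * (JITTER * random.random())
    return timeout if timeout >= min_timeout else min_timeout
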
@@ -5630,7 +5680,7 @@ def shorten_message_to_fit_limit(message, tokens_needed, model: Optional[str]):
 def trim_messages(
     messages,
     model: Optional[str] = None,
-    trim_ratio: float = 0.75,
+    trim_ratio: float = DEFAULT_TRIM_RATIO,
     return_response_tokens: bool = False,
     max_tokens=None,
 ):
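
trim_messages keeps its behavior; only the default trim_ratio now comes from litellm.constants. A hedged usage sketch (the model name is illustrative):

from litellm.utils import trim_messages

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Summarize this very long transcript ..."},
]

# Trims messages to fit the model's context window; trim_ratio defaults to DEFAULT_TRIM_RATIO.
trimmed = trim_messages(messages, model="gpt-3.5-turbo")
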
@@ -5901,9 +5951,10 @@ class ModelResponseIterator:


 class ModelResponseListIterator:
-    def __init__(self, model_responses):
+    def __init__(self, model_responses, delay: Optional[float] = None):
         self.model_responses = model_responses
         self.index = 0
+        self.delay = delay

     # Sync iterator
     def __iter__(self):
@@ -5914,6 +5965,8 @@ class ModelResponseListIterator:
             raise StopIteration
         model_response = self.model_responses[self.index]
         self.index += 1
+        if self.delay:
+            time.sleep(self.delay)
         return model_response

     # Async iterator
@@ -5925,6 +5978,8 @@ class ModelResponseListIterator:
             raise StopAsyncIteration
         model_response = self.model_responses[self.index]
         self.index += 1
+        if self.delay:
+            await asyncio.sleep(self.delay)
         return model_response


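
With the new delay argument, a fixed list of responses can be replayed as a slow stream, which is handy for tests. A hedged sketch (assumes ModelResponseListIterator is importable from litellm.utils; plain strings stand in for ModelResponse chunks):

import asyncio

from litellm.utils import ModelResponseListIterator

chunks = ["chunk-1", "chunk-2", "chunk-3"]  # stand-ins for ModelResponse objects


async def replay_slow_stream():
    # Each item is yielded roughly 0.1s apart, mimicking a slow provider stream.
    async for chunk in ModelResponseListIterator(chunks, delay=0.1):
        print(chunk)


asyncio.run(replay_slow_stream())
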
@@ -6215,7 +6270,7 @@ class ProviderConfigManager:
         elif litellm.LlmProviders.REPLICATE == provider:
             return litellm.ReplicateConfig()
         elif litellm.LlmProviders.HUGGINGFACE == provider:
-            return litellm.HuggingfaceConfig()
+            return litellm.HuggingFaceChatConfig()
         elif litellm.LlmProviders.TOGETHER_AI == provider:
             return litellm.TogetherAIConfig()
         elif litellm.LlmProviders.OPENROUTER == provider:
@@ -6423,6 +6478,19 @@ class ProviderConfigManager:
             return litellm.TopazImageVariationConfig()
         return None

+    @staticmethod
+    def get_provider_files_config(
+        model: str,
+        provider: LlmProviders,
+    ) -> Optional[BaseFilesConfig]:
+        if LlmProviders.GEMINI == provider:
+            from litellm.llms.gemini.files.transformation import (
+                GoogleAIStudioFilesHandler,  # experimental approach, to reduce bloat on __init__.py
+            )
+
+            return GoogleAIStudioFilesHandler()
+        return None
+

 def get_end_user_id_for_cost_tracking(
     litellm_params: dict,
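
A hedged sketch of resolving the new files config: Gemini returns the handler shown above and every other provider falls through to None (the import path for LlmProviders is an assumption, and the model name is illustrative):

from litellm.types.utils import LlmProviders  # assumed location of the provider enum
from litellm.utils import ProviderConfigManager

files_config = ProviderConfigManager.get_provider_files_config(
    model="gemini/gemini-1.5-flash",  # illustrative model name
    provider=LlmProviders.GEMINI,
)
print(type(files_config))  # GoogleAIStudioFilesHandler for GEMINI; other providers yield None
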
@@ -6487,7 +6555,7 @@ def is_prompt_caching_valid_prompt(
             model=model,
             use_default_image_token_count=True,
         )
-        return token_count >= 1024
+        return token_count >= MINIMUM_PROMPT_CACHE_TOKEN_COUNT
     except Exception as e:
         verbose_logger.error(f"Error in is_prompt_caching_valid_prompt: {e}")
         return False