(litellm SDK perf improvement) - use verbose_logger.debug and _cached_get_model_info_helper in _response_cost_calculator (#7720)

* define _cached_get_model_info_helper

* use _cached_get_model_info_helper
Ishaan Jaff, 2025-01-12 15:27:54 -08:00, committed by GitHub
commit 6518bc70a0
parent 15b52039d2
2 changed files with 29 additions and 9 deletions

litellm/cost_calculator.py

@@ -60,8 +60,7 @@ from litellm.utils import (
     ModelResponse,
     TextCompletionResponse,
     TranscriptionResponse,
-    _get_model_info_helper,
-    print_verbose,
+    _cached_get_model_info_helper,
     token_counter,
 )
@@ -279,7 +278,7 @@ def cost_per_token(  # noqa: PLR0915
     elif custom_llm_provider == "deepseek":
         return deepseek_cost_per_token(model=model, usage=usage_block)
     else:
-        model_info = _get_model_info_helper(
+        model_info = _cached_get_model_info_helper(
             model=model, custom_llm_provider=custom_llm_provider
         )
@@ -292,8 +291,11 @@ def cost_per_token(  # noqa: PLR0915
             model_info.get("input_cost_per_second", None) is not None
             and response_time_ms is not None
         ):
-            print_verbose(
-                f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+            verbose_logger.debug(
+                "For model=%s - input_cost_per_second: %s; response time: %s",
+                model,
+                model_info.get("input_cost_per_second", None),
+                response_time_ms,
             )
             ## COST PER SECOND ##
             prompt_tokens_cost_usd_dollar = (
@@ -308,16 +310,22 @@ def cost_per_token(  # noqa: PLR0915
             model_info.get("output_cost_per_second", None) is not None
             and response_time_ms is not None
         ):
-            print_verbose(
-                f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+            verbose_logger.debug(
+                "For model=%s - output_cost_per_second: %s; response time: %s",
+                model,
+                model_info.get("output_cost_per_second", None),
+                response_time_ms,
             )
             ## COST PER SECOND ##
             completion_tokens_cost_usd_dollar = (
                 model_info["output_cost_per_second"] * response_time_ms / 1000  # type: ignore
             )
-        print_verbose(
-            f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
+        verbose_logger.debug(
+            "Returned custom cost for model=%s - prompt_tokens_cost_usd_dollar: %s, completion_tokens_cost_usd_dollar: %s",
+            model,
+            prompt_tokens_cost_usd_dollar,
+            completion_tokens_cost_usd_dollar,
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
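
Note on why the logging swap matters for performance: print_verbose received an already-built f-string, so the cost strings were formatted on every call even when verbose output was disabled, whereas verbose_logger.debug("... %s", ...) hands the template and arguments to the logging module, which only formats them if the record is actually emitted. A minimal standalone sketch of that difference using the stdlib logging module (the logger name and the Expensive class are illustrative, not from the litellm codebase):

    import logging

    logger = logging.getLogger("demo")
    logger.setLevel(logging.INFO)  # DEBUG records are filtered out


    class Expensive:
        """Stand-in for a value whose string conversion is costly."""

        def __str__(self) -> str:
            print("str() ran")  # visible side effect marking when formatting happens
            return "expensive"


    value = Expensive()

    # Eager: the f-string (and str(value)) is evaluated before debug() is even
    # called, although the record is then discarded.
    logger.debug(f"value: {value}")

    # Lazy: logging stores the template and args; since DEBUG is disabled,
    # %-formatting never runs and str(value) is skipped entirely.
    logger.debug("value: %s", value)

At high RPS this removes string-building from the hot path whenever debug logging is off, which is the stated goal of the commit.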

litellm/utils.py

@@ -4013,6 +4013,18 @@ def _get_max_position_embeddings(model_name: str) -> Optional[int]:
     return None


+@lru_cache(maxsize=16)
+def _cached_get_model_info_helper(
+    model: str, custom_llm_provider: Optional[str]
+) -> ModelInfoBase:
+    """
+    _get_model_info_helper wrapped with lru_cache
+
+    Speed Optimization to hit high RPS
+    """
+    return _get_model_info_helper(model=model, custom_llm_provider=custom_llm_provider)
+
+
 def _get_model_info_helper(  # noqa: PLR0915
     model: str, custom_llm_provider: Optional[str] = None
 ) -> ModelInfoBase:
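
For readers unfamiliar with the pattern: functools.lru_cache memoizes the wrapper on its argument tuple, so each distinct (model, custom_llm_provider) pair pays the resolution cost once and subsequent lookups are dictionary hits; with maxsize=16, the least recently used entry is evicted once more than 16 distinct pairs have been seen. A rough sketch of the same pattern (slow_lookup and cached_lookup are hypothetical stand-ins, not litellm APIs):

    from functools import lru_cache
    from typing import Optional


    def slow_lookup(model: str, provider: Optional[str]) -> dict:
        # Hypothetical stand-in for the real model-info resolution work.
        print(f"resolving {model!r} for provider {provider!r}")
        return {"model": model, "provider": provider}


    @lru_cache(maxsize=16)
    def cached_lookup(model: str, provider: Optional[str]) -> dict:
        """slow_lookup wrapped with lru_cache, mirroring the commit's pattern."""
        return slow_lookup(model, provider)


    cached_lookup("gpt-4o", "openai")  # miss: slow_lookup runs, result is stored
    cached_lookup("gpt-4o", "openai")  # hit: served from the cache, no print
    print(cached_lookup.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=16, currsize=1)

The caveats are the usual lru_cache ones: arguments must be hashable (str and None are), and entries live for the life of the process, so caching here is only appropriate on the assumption that a model's info does not change at runtime.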