(litellm SDK perf improvement) - use verbose_logger.debug and _cached_get_model_info_helper in _response_cost_calculator (#7720)

* define _cached_get_model_info_helper

* use _cached_get_model_info_helper
Ishaan Jaff, 2025-01-12 15:27:54 -08:00, committed by GitHub
commit 6518bc70a0
parent 15b52039d2
2 changed files with 29 additions and 9 deletions

litellm/cost_calculator.py

@@ -60,8 +60,7 @@ from litellm.utils import (
     ModelResponse,
     TextCompletionResponse,
     TranscriptionResponse,
-    _get_model_info_helper,
-    print_verbose,
+    _cached_get_model_info_helper,
     token_counter,
 )
@@ -279,7 +278,7 @@ def cost_per_token(  # noqa: PLR0915
     elif custom_llm_provider == "deepseek":
         return deepseek_cost_per_token(model=model, usage=usage_block)
     else:
-        model_info = _get_model_info_helper(
+        model_info = _cached_get_model_info_helper(
             model=model, custom_llm_provider=custom_llm_provider
         )
@@ -292,8 +291,11 @@ def cost_per_token(  # noqa: PLR0915
             model_info.get("input_cost_per_second", None) is not None
             and response_time_ms is not None
         ):
-            print_verbose(
-                f"For model={model} - input_cost_per_second: {model_info.get('input_cost_per_second')}; response time: {response_time_ms}"
+            verbose_logger.debug(
+                "For model=%s - input_cost_per_second: %s; response time: %s",
+                model,
+                model_info.get("input_cost_per_second", None),
+                response_time_ms,
             )
             ## COST PER SECOND ##
             prompt_tokens_cost_usd_dollar = (
@@ -308,16 +310,22 @@ def cost_per_token(  # noqa: PLR0915
             model_info.get("output_cost_per_second", None) is not None
             and response_time_ms is not None
         ):
-            print_verbose(
-                f"For model={model} - output_cost_per_second: {model_info.get('output_cost_per_second')}; response time: {response_time_ms}"
+            verbose_logger.debug(
+                "For model=%s - output_cost_per_second: %s; response time: %s",
+                model,
+                model_info.get("output_cost_per_second", None),
+                response_time_ms,
             )
             ## COST PER SECOND ##
             completion_tokens_cost_usd_dollar = (
                 model_info["output_cost_per_second"] * response_time_ms / 1000  # type: ignore
             )
-        print_verbose(
-            f"Returned custom cost for model={model} - prompt_tokens_cost_usd_dollar: {prompt_tokens_cost_usd_dollar}, completion_tokens_cost_usd_dollar: {completion_tokens_cost_usd_dollar}"
+        verbose_logger.debug(
+            "Returned custom cost for model=%s - prompt_tokens_cost_usd_dollar: %s, completion_tokens_cost_usd_dollar: %s",
+            model,
+            prompt_tokens_cost_usd_dollar,
+            completion_tokens_cost_usd_dollar,
         )
         return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
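
Note on why the logging swap matters for performance: print_verbose received an already-built f-string, so the cost strings were formatted on every call even when verbose output was disabled, whereas verbose_logger.debug("... %s", ...) hands the template and arguments to the logging module, which only formats them if the record is actually emitted. A minimal standalone sketch of that difference using the stdlib logging module (the logger name and the Expensive class are illustrative, not from the litellm codebase):

    import logging

    logger = logging.getLogger("demo")
    logger.setLevel(logging.INFO)  # DEBUG records are filtered out


    class Expensive:
        """Stand-in for a value whose string conversion is costly."""

        def __str__(self) -> str:
            print("str() ran")  # visible side effect marking when formatting happens
            return "expensive"


    value = Expensive()

    # Eager: the f-string (and str(value)) is evaluated before debug() is even
    # called, although the record is then discarded.
    logger.debug(f"value: {value}")

    # Lazy: logging stores the template and args; since DEBUG is disabled,
    # %-formatting never runs and str(value) is skipped entirely.
    logger.debug("value: %s", value)

At high RPS this removes string-building from the hot path whenever debug logging is off, which is the stated goal of the commit.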

litellm/utils.py

@@ -4013,6 +4013,18 @@ def _get_max_position_embeddings(model_name: str) -> Optional[int]:
     return None


+@lru_cache(maxsize=16)
+def _cached_get_model_info_helper(
+    model: str, custom_llm_provider: Optional[str]
+) -> ModelInfoBase:
+    """
+    _get_model_info_helper wrapped with lru_cache
+
+    Speed Optimization to hit high RPS
+    """
+    return _get_model_info_helper(model=model, custom_llm_provider=custom_llm_provider)
+
+
 def _get_model_info_helper(  # noqa: PLR0915
     model: str, custom_llm_provider: Optional[str] = None
 ) -> ModelInfoBase:
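
For readers unfamiliar with the pattern: functools.lru_cache memoizes the wrapper on its argument tuple, so each distinct (model, custom_llm_provider) pair pays the resolution cost once and subsequent lookups are dictionary hits; with maxsize=16, the least recently used entry is evicted once more than 16 distinct pairs have been seen. A rough sketch of the same pattern (slow_lookup and cached_lookup are hypothetical stand-ins, not litellm APIs):

    from functools import lru_cache
    from typing import Optional


    def slow_lookup(model: str, provider: Optional[str]) -> dict:
        # Hypothetical stand-in for the real model-info resolution work.
        print(f"resolving {model!r} for provider {provider!r}")
        return {"model": model, "provider": provider}


    @lru_cache(maxsize=16)
    def cached_lookup(model: str, provider: Optional[str]) -> dict:
        """slow_lookup wrapped with lru_cache, mirroring the commit's pattern."""
        return slow_lookup(model, provider)


    cached_lookup("gpt-4o", "openai")  # miss: slow_lookup runs, result is stored
    cached_lookup("gpt-4o", "openai")  # hit: served from the cache, no print
    print(cached_lookup.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=16, currsize=1)

The caveats are the usual lru_cache ones: arguments must be hashable (str and None are), and entries live for the life of the process, so caching here is only appropriate on the assumption that a model's info does not change at runtime.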