From 2488e4b45ffb28ff1d04146a28fddcdec749665f Mon Sep 17 00:00:00 2001
From: Krish Dholakia
Date: Sat, 21 Sep 2024 21:47:50 -0700
Subject: [PATCH] Cost tracking improvements (#5828)

* feat(litellm_logging.py): update standard logging payload to include debug information for cost failures

Also includes fixes for cohere rerank cost tracking + databricks llama2 model cost tracking.

Makes cost failures easier to reproduce and improves reliability in prod.

* fix(proxy_server.py): emit cost failure debug info for slack alerting

Improves the debug information emitted for cost tracking failures in slack alerts.
---
 litellm/cost_calculator.py                    | 45 ++++++-----
 litellm/litellm_core_utils/litellm_logging.py | 75 ++++++++++++++-----
 litellm/llms/databricks/cost_calculator.py    |  4 +
 litellm/proxy/_new_secret_config.yaml         |  8 ++
 litellm/proxy/proxy_server.py                 | 10 ++-
 litellm/types/utils.py                        | 20 +++++
 6 files changed, 117 insertions(+), 45 deletions(-)

diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py
index a176190d0..b55862aaf 100644
--- a/litellm/cost_calculator.py
+++ b/litellm/cost_calculator.py
@@ -250,6 +250,13 @@ def cost_per_token(
             )
         )
         return prompt_cost, completion_cost
+    elif call_type == "arerank" or call_type == "rerank":
+        completion_tokens_cost_usd_dollar = rerank_cost(
+            model=model,
+            custom_llm_provider=custom_llm_provider,
+        )
+        prompt_tokens_cost_usd_dollar = 0
+        return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
     elif model in model_cost_ref:
         print_verbose(f"Success: model={model} in model_cost_map")
         print_verbose(
@@ -689,7 +696,18 @@ def completion_cost(
             call_type == CallTypes.speech.value or call_type == CallTypes.aspeech.value
         ):
             prompt_characters = litellm.utils._count_characters(text=prompt)
-
+        elif (
+            call_type == CallTypes.rerank.value or call_type == CallTypes.arerank.value
+        ):
+            if completion_response is not None and isinstance(
+                completion_response, RerankResponse
+            ):
+                meta_obj = completion_response.meta
+                billed_units = meta_obj.get("billed_units", {}) or {}
+                search_units = (
+                    billed_units.get("search_units") or 1
+                )  # cohere charges per request by default.
+                completion_tokens = search_units
         # Calculate cost based on prompt_tokens, completion_tokens
         if (
             "togethercomputer" in model
@@ -794,7 +812,7 @@ def response_cost_calculator(
 ) -> Optional[float]:
     """
     Returns
-    - float or None: cost of response OR none if error.
+    - float or None: cost of response
     """
     try:
         response_cost: float = 0.0
@@ -810,15 +828,6 @@ def response_cost_calculator(
                 call_type=call_type,
                 custom_llm_provider=custom_llm_provider,
             )
-        elif isinstance(response_object, RerankResponse) and (
-            call_type == "arerank" or call_type == "rerank"
-        ):
-            response_cost = rerank_cost(
-                rerank_response=response_object,
-                model=model,
-                call_type=call_type,
-                custom_llm_provider=custom_llm_provider,
-            )
         else:
             if custom_pricing is True:  # override defaults if custom pricing is set
                 base_model = model
@@ -831,24 +840,12 @@ def response_cost_calculator(
                 custom_llm_provider=custom_llm_provider,
             )
         return response_cost
-    except litellm.NotFoundError as e:
-        verbose_logger.debug(  # debug since it can be spammy in logs, for calls
-            f"Model={model} for LLM Provider={custom_llm_provider} not found in completion cost map."
-        )
-        return None
     except Exception as e:
-        verbose_logger.debug(
-            "litellm.cost_calculator.py::response_cost_calculator - Returning None. Exception occurred - {}\n{}".format(
-                str(e), traceback.format_exc()
-            )
-        )
-        return None
+        raise e
 
 
 def rerank_cost(
-    rerank_response: RerankResponse,
     model: str,
-    call_type: Literal["rerank", "arerank"],
     custom_llm_provider: Optional[str],
 ) -> float:
     """
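Note on the rerank changes above: cost is now keyed off Cohere-style billed units rather than token counts, and rerank is routed through cost_per_token() instead of a separate branch in response_cost_calculator(). A minimal sketch of the billed-units lookup, using a plain dict as a stand-in for RerankResponse.meta (illustrative, not part of the patch):

    # Mirrors the new rerank branch in completion_cost(): cohere reports
    # billed_units.search_units in the response meta and bills one search
    # unit per request when the field is absent.
    def search_units_from_meta(meta: dict) -> int:
        billed_units = meta.get("billed_units", {}) or {}
        return billed_units.get("search_units") or 1

    assert search_units_from_meta({"billed_units": {"search_units": 2}}) == 2
    assert search_units_from_meta({}) == 1  # default: one unit per request
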
diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py
index 2b5cd2187..44de32b15 100644
--- a/litellm/litellm_core_utils/litellm_logging.py
+++ b/litellm/litellm_core_utils/litellm_logging.py
@@ -41,6 +41,7 @@ from litellm.types.utils import (
     ModelResponse,
     StandardLoggingHiddenParams,
     StandardLoggingMetadata,
+    StandardLoggingModelCostFailureDebugInformation,
     StandardLoggingModelInformation,
     StandardLoggingPayload,
     StandardLoggingPayloadStatus,
@@ -574,7 +575,7 @@ class Logging:
             RerankResponse,
         ],
         cache_hit: Optional[bool] = None,
-    ):
+    ) -> Optional[float]:
         """
         Calculate response cost using result + logging object variables.
 
@@ -590,22 +591,53 @@ class Logging:
         if cache_hit is None:
             cache_hit = self.model_call_details.get("cache_hit", False)
 
-        response_cost = litellm.response_cost_calculator(
-            response_object=result,
-            model=self.model,
-            cache_hit=cache_hit,
-            custom_llm_provider=self.model_call_details.get(
-                "custom_llm_provider", None
-            ),
-            base_model=_get_base_model_from_metadata(
-                model_call_details=self.model_call_details
-            ),
-            call_type=self.call_type,
-            optional_params=self.optional_params,
-            custom_pricing=custom_pricing,
-        )
+        try:
+            response_cost_calculator_kwargs = {
+                "response_object": result,
+                "model": self.model,
+                "cache_hit": cache_hit,
+                "custom_llm_provider": self.model_call_details.get(
+                    "custom_llm_provider", None
+                ),
+                "base_model": _get_base_model_from_metadata(
+                    model_call_details=self.model_call_details
+                ),
+                "call_type": self.call_type,
+                "optional_params": self.optional_params,
+                "custom_pricing": custom_pricing,
+            }
+        except Exception as e:  # error creating kwargs for cost calculation
+            self.model_call_details["response_cost_failure_debug_information"] = (
+                StandardLoggingModelCostFailureDebugInformation(
+                    error_str=str(e),
+                    traceback_str=traceback.format_exc(),
+                )
+            )
+            return None
 
-        return response_cost
+        try:
+            response_cost = litellm.response_cost_calculator(
+                **response_cost_calculator_kwargs
+            )
+
+            return response_cost
+        except Exception as e:  # error calculating cost
+            self.model_call_details["response_cost_failure_debug_information"] = (
+                StandardLoggingModelCostFailureDebugInformation(
+                    error_str=str(e),
+                    traceback_str=traceback.format_exc(),
+                    model=response_cost_calculator_kwargs["model"],
+                    cache_hit=response_cost_calculator_kwargs["cache_hit"],
+                    custom_llm_provider=response_cost_calculator_kwargs[
+                        "custom_llm_provider"
+                    ],
+                    base_model=response_cost_calculator_kwargs["base_model"],
+                    call_type=response_cost_calculator_kwargs["call_type"],
+                    custom_pricing=response_cost_calculator_kwargs["custom_pricing"],
+                )
+            )
+
+            return None
 
     def _success_handler_helper_fn(
         self, result=None, start_time=None, end_time=None, cache_hit=None
@@ -2501,12 +2533,16 @@ def get_standard_logging_object_payload(
         )
     except Exception:
         verbose_logger.debug(  # keep in debug otherwise it will trigger on every call
-            "Model is not mapped in model cost map. Defaulting to None model_cost_information for standard_logging_payload"
+            "Model={} is not mapped in model cost map. Defaulting to None model_cost_information for standard_logging_payload".format(
+                model_cost_name
+            )
         )
         model_cost_information = StandardLoggingModelInformation(
             model_map_key=model_cost_name, model_map_value=None
         )
 
+    response_cost: float = kwargs.get("response_cost", 0) or 0.0
+
     payload: StandardLoggingPayload = StandardLoggingPayload(
         id=str(id),
         call_type=call_type or "",
@@ -2519,7 +2555,7 @@ def get_standard_logging_object_payload(
         model=kwargs.get("model", "") or "",
         metadata=clean_metadata,
         cache_key=cache_key,
-        response_cost=kwargs.get("response_cost", 0),
+        response_cost=response_cost,
         total_tokens=usage.get("total_tokens", 0),
         prompt_tokens=usage.get("prompt_tokens", 0),
         completion_tokens=usage.get("completion_tokens", 0),
@@ -2537,6 +2573,9 @@ def get_standard_logging_object_payload(
         hidden_params=clean_hidden_params,
         model_map_information=model_cost_information,
         error_str=error_str,
+        response_cost_failure_debug_info=kwargs.get(
+            "response_cost_failure_debug_information"
+        ),
     )
 
     verbose_logger.debug(
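Note on the logging changes above: the two try/except blocks implement a capture-don't-raise pattern. A failure while building kwargs or while computing cost is recorded on model_call_details instead of propagating, so logging callbacks still fire with a None cost plus structured debug info. A standalone sketch of the same pattern (names are illustrative):

    import traceback
    from typing import Callable, Optional

    def cost_or_debug_info(
        calc: Callable[..., float], kwargs: dict, sink: dict
    ) -> Optional[float]:
        """Return the computed cost, or stash structured debug info and return None."""
        try:
            return calc(**kwargs)
        except Exception as e:
            sink["response_cost_failure_debug_information"] = {
                "error_str": str(e),
                "traceback_str": traceback.format_exc(),
                "model": kwargs.get("model"),
                "call_type": kwargs.get("call_type"),
            }
            return None
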
diff --git a/litellm/llms/databricks/cost_calculator.py b/litellm/llms/databricks/cost_calculator.py
index 695a6055f..5558e133b 100644
--- a/litellm/llms/databricks/cost_calculator.py
+++ b/litellm/llms/databricks/cost_calculator.py
@@ -49,6 +49,10 @@ def cost_per_token(model: str, usage: Usage) -> Tuple[float, float]:
         "gte-large-en"
     ):
         base_model = "databricks-gte-large-en"
+    elif model.startswith("databricks/llama-2-70b-chat") or model.startswith(
+        "llama-2-70b-chat"
+    ):
+        base_model = "databricks-llama-2-70b-chat"
 
     ## GET MODEL INFO
     model_info = get_model_info(model=base_model, custom_llm_provider="databricks")
diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml
index f130e4918..4e6a493ba 100644
--- a/litellm/proxy/_new_secret_config.yaml
+++ b/litellm/proxy/_new_secret_config.yaml
@@ -23,6 +23,14 @@ model_list:
     litellm_params:
       model: cohere/rerank-english-v3.0
       api_key: os.environ/COHERE_API_KEY
+  - model_name: "databricks/*"
+    litellm_params:
+      model: "databricks/*"
+      api_key: os.environ/DATABRICKS_API_KEY
+      api_base: os.environ/DATABRICKS_API_BASE
+  - model_name: "anthropic/*"
+    litellm_params:
+      model: "anthropic/*"
 
 
 litellm_settings:
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index d5abe7478..03ba2e839 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -824,11 +824,15 @@ async def _PROXY_track_cost_callback(
                     "User API key and team id and user id missing from custom callback."
                 )
         else:
-            if kwargs["stream"] != True or (
-                kwargs["stream"] == True and "complete_streaming_response" in kwargs
+            if kwargs["stream"] is not True or (
+                kwargs["stream"] is True and "complete_streaming_response" in kwargs
             ):
+                cost_tracking_failure_debug_info = kwargs.get(
+                    "response_cost_failure_debug_information"
+                )
+                model = kwargs.get("model")
                 raise Exception(
-                    f"Model not in litellm model cost map. Passed model = {kwargs.get('model')} - Add custom pricing - https://docs.litellm.ai/docs/proxy/custom_pricing"
+                    f"Cost tracking failed for model={model}.\nDebug info - {cost_tracking_failure_debug_info}\nAdd custom pricing - https://docs.litellm.ai/docs/proxy/custom_pricing"
                 )
     except Exception as e:
         error_msg = f"error in tracking cost callback - {traceback.format_exc()}"
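Note on the proxy change above: the stream check decides when a cost-tracking failure may be raised, immediately for non-streaming calls but only after complete_streaming_response has been assembled for streaming ones. The guard in isolation (a sketch; the real code indexes kwargs["stream"] directly):

    def should_raise_cost_failure(kwargs: dict) -> bool:
        # Non-streaming: cost should be known as soon as the call returns.
        # Streaming: only once the complete response has been stitched together.
        return kwargs.get("stream") is not True or (
            kwargs.get("stream") is True and "complete_streaming_response" in kwargs
        )

    assert should_raise_cost_failure({"stream": False})
    assert not should_raise_cost_failure({"stream": True})
    assert should_raise_cost_failure(
        {"stream": True, "complete_streaming_response": object()}
    )
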
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 0c92dabf5..9f8c8730b 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -1281,6 +1281,23 @@ class StandardLoggingModelInformation(TypedDict):
     model_map_value: Optional[ModelInfo]
 
 
+class StandardLoggingModelCostFailureDebugInformation(TypedDict, total=False):
+    """
+    Debug information, if cost tracking fails.
+
+    Avoid logging sensitive information like response or optional params
+    """
+
+    error_str: Required[str]
+    traceback_str: Required[str]
+    model: str
+    cache_hit: Optional[bool]
+    custom_llm_provider: Optional[str]
+    base_model: Optional[str]
+    call_type: str
+    custom_pricing: Optional[bool]
+
+
 StandardLoggingPayloadStatus = Literal["success", "failure"]
 
 
@@ -1288,6 +1305,9 @@ class StandardLoggingPayload(TypedDict):
     id: str
     call_type: str
     response_cost: float
+    response_cost_failure_debug_info: Optional[
+        StandardLoggingModelCostFailureDebugInformation
+    ]
     status: StandardLoggingPayloadStatus
     total_tokens: int
     prompt_tokens: int
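For reference, the new TypedDict can be exercised on its own: total=False combined with Required[...] makes error_str and traceback_str mandatory while every other field stays optional. A self-contained sketch (the class body mirrors litellm/types/utils.py above; the surrounding values are illustrative):

    import traceback
    from typing import Optional
    from typing_extensions import Required, TypedDict

    class StandardLoggingModelCostFailureDebugInformation(TypedDict, total=False):
        error_str: Required[str]
        traceback_str: Required[str]
        model: str
        cache_hit: Optional[bool]
        custom_llm_provider: Optional[str]
        base_model: Optional[str]
        call_type: str
        custom_pricing: Optional[bool]

    try:
        raise ValueError("model not mapped in litellm model cost map")
    except ValueError as e:
        debug_info: StandardLoggingModelCostFailureDebugInformation = {
            "error_str": str(e),
            "traceback_str": traceback.format_exc(),
            "model": "my-custom-model",
            "call_type": "completion",
        }
        print(debug_info["error_str"])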