Fix Batches API cost tracking + log batch models in spend logs / standard logging payload (#9077)

* feat(batches/): fix batch cost calculation - ensure it's accurate

use the correct cost value - previously this defaulted to the non-batch cost

* feat(batch_utils.py): log batch models to spend logs + standard logging payload

makes it easy to understand how the cost was calculated

* fix: fix stored payload for test

* test: fix test
Krish Dholakia 2025-03-08 11:47:25 -08:00 committed by GitHub
parent 8c049dfffc
commit 4330ef8e81
8 changed files with 110 additions and 7 deletions
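
For context, a rough end-to-end sketch of the flow this commit touches (the batch id and provider are illustrative; litellm.aretrieve_batch mirrors the OpenAI Batches API): once a retrieved batch reports status "completed", the logging path changed below reads the batch output file, computes cost with batch-specific pricing, and records the models seen in the output.

import asyncio
import litellm

async def main():
    # Illustrative id; assumes a batch was created earlier via litellm.acreate_batch.
    batch = await litellm.aretrieve_batch(
        batch_id="batch_abc123",
        custom_llm_provider="openai",
    )
    # When batch.status == "completed", the success-logging path in this commit
    # attaches the computed response_cost and the batch_models list to the
    # logging payload (and, via the proxy, to SpendLogs metadata).
    print(batch.status)

asyncio.run(main())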


@@ -4,13 +4,13 @@ from typing import Any, List, Literal, Tuple
 import litellm
 from litellm._logging import verbose_logger
 from litellm.types.llms.openai import Batch
-from litellm.types.utils import Usage
+from litellm.types.utils import CallTypes, Usage


 async def _handle_completed_batch(
     batch: Batch,
     custom_llm_provider: Literal["openai", "azure", "vertex_ai"],
-) -> Tuple[float, Usage]:
+) -> Tuple[float, Usage, List[str]]:
     """Helper function to process a completed batch and handle logging"""
     # Get batch results
     file_content_dictionary = await _get_batch_output_file_content_as_dictionary(
@@ -27,7 +27,25 @@ async def _handle_completed_batch
         custom_llm_provider=custom_llm_provider,
     )
-    return batch_cost, batch_usage
+    batch_models = _get_batch_models_from_file_content(file_content_dictionary)
+    return batch_cost, batch_usage, batch_models
+
+
+def _get_batch_models_from_file_content(
+    file_content_dictionary: List[dict],
+) -> List[str]:
+    """
+    Get the models from the file content
+    """
+    batch_models = []
+    for _item in file_content_dictionary:
+        if _batch_response_was_successful(_item):
+            _response_body = _get_response_from_batch_job_output_file(_item)
+            _model = _response_body.get("model")
+            if _model:
+                batch_models.append(_model)
+    return batch_models


 async def _batch_cost_calculator(
@@ -105,6 +123,7 @@ def _get_batch_job_cost_from_file_content(
             total_cost += litellm.completion_cost(
                 completion_response=_response_body,
                 custom_llm_provider=custom_llm_provider,
+                call_type=CallTypes.aretrieve_batch.value,
             )
     verbose_logger.debug("total_cost=%s", total_cost)
     return total_cost
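
The call_type hint added above is what routes per-request cost through the batch pricing path in cost_per_token (next file). A minimal sketch of the same call made directly, with a hand-written response body (model and token counts are illustrative, and the dict is reduced to the fields that matter here):

import litellm
from litellm.types.utils import CallTypes

# One parsed line from a batch output file, trimmed down for illustration.
_response_body = {
    "object": "chat.completion",
    "model": "gpt-4o-mini",
    "usage": {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150},
}

line_cost = litellm.completion_cost(
    completion_response=_response_body,
    custom_llm_provider="openai",
    call_type=CallTypes.aretrieve_batch.value,  # selects batch pricing
)
print(line_cost)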


@@ -239,6 +239,15 @@ def cost_per_token( # noqa: PLR0915
             custom_llm_provider=custom_llm_provider,
             billed_units=rerank_billed_units,
         )
+    elif (
+        call_type == "aretrieve_batch"
+        or call_type == "retrieve_batch"
+        or call_type == CallTypes.aretrieve_batch
+        or call_type == CallTypes.retrieve_batch
+    ):
+        return batch_cost_calculator(
+            usage=usage_block, model=model, custom_llm_provider=custom_llm_provider
+        )
     elif call_type == "atranscription" or call_type == "transcription":
         return openai_cost_per_second(
             model=model,
@@ -960,3 +969,54 @@ def default_image_cost_calculator(
     )
     return cost_info["input_cost_per_pixel"] * height * width * n
+
+
+def batch_cost_calculator(
+    usage: Usage,
+    model: str,
+    custom_llm_provider: Optional[str] = None,
+) -> Tuple[float, float]:
+    """
+    Calculate the cost of a batch job
+    """
+    _, custom_llm_provider, _, _ = litellm.get_llm_provider(
+        model=model, custom_llm_provider=custom_llm_provider
+    )
+
+    verbose_logger.info(
+        "Calculating batch cost per token. model=%s, custom_llm_provider=%s",
+        model,
+        custom_llm_provider,
+    )
+
+    try:
+        model_info: Optional[ModelInfo] = litellm.get_model_info(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+    except Exception:
+        model_info = None
+
+    if not model_info:
+        return 0.0, 0.0
+
+    input_cost_per_token_batches = model_info.get("input_cost_per_token_batches")
+    input_cost_per_token = model_info.get("input_cost_per_token")
+    output_cost_per_token_batches = model_info.get("output_cost_per_token_batches")
+    output_cost_per_token = model_info.get("output_cost_per_token")
+    total_prompt_cost = 0.0
+    total_completion_cost = 0.0
+    if input_cost_per_token_batches:
+        total_prompt_cost = usage.prompt_tokens * input_cost_per_token_batches
+    elif input_cost_per_token:
+        total_prompt_cost = (
+            usage.prompt_tokens * (input_cost_per_token) / 2
+        )  # batch cost is usually half of the regular token cost
+    if output_cost_per_token_batches:
+        total_completion_cost = usage.completion_tokens * output_cost_per_token_batches
+    elif output_cost_per_token:
+        total_completion_cost = (
+            usage.completion_tokens * (output_cost_per_token) / 2
+        )  # batch cost is usually half of the regular token cost
+
+    return total_prompt_cost, total_completion_cost
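
A quick worked example of the fallback above, assuming made-up per-token prices and no *_batches prices in the model info:

# Illustrative prices only (not taken from the model cost map).
input_cost_per_token = 2.5e-6
output_cost_per_token = 10e-6
prompt_tokens, completion_tokens = 1_000, 200

total_prompt_cost = prompt_tokens * input_cost_per_token / 2           # 0.00125
total_completion_cost = completion_tokens * output_cost_per_token / 2  # 0.001
# batch_cost_calculator would return (0.00125, 0.001) -> total batch cost 0.00225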


@@ -1613,11 +1613,12 @@ class Logging(LiteLLMLoggingBaseClass)
                 result, LiteLLMBatch
             ):
-                response_cost, batch_usage = await _handle_completed_batch(
+                response_cost, batch_usage, batch_models = await _handle_completed_batch(
                     batch=result, custom_llm_provider=self.custom_llm_provider
                 )

                 result._hidden_params["response_cost"] = response_cost
+                result._hidden_params["batch_models"] = batch_models
                 result.usage = batch_usage

         start_time, end_time, result = self._success_handler_helper_fn(
@@ -3213,6 +3214,7 @@ class StandardLoggingPayloadSetup:
             response_cost=None,
             additional_headers=None,
             litellm_overhead_time_ms=None,
+            batch_models=None,
         )
         if hidden_params is not None:
             for key in StandardLoggingHiddenParams.__annotations__.keys():
@@ -3326,6 +3328,7 @@ def get_standard_logging_object_payload(
             api_base=None,
             response_cost=None,
             litellm_overhead_time_ms=None,
+            batch_models=None,
         )
     )
@@ -3610,6 +3613,7 @@ def create_dummy_standard_logging_payload() -> StandardLoggingPayload:
         response_cost=None,
         additional_headers=None,
         litellm_overhead_time_ms=None,
+        batch_models=None,
     )

     # Convert numeric values to appropriate types


@@ -1897,6 +1897,7 @@ class SpendLogsMetadata(TypedDict):
     applied_guardrails: Optional[List[str]]
     status: StandardLoggingPayloadStatus
     proxy_server_request: Optional[str]
+    batch_models: Optional[List[str]]
     error_information: Optional[StandardLoggingPayloadErrorInformation]


@@ -35,7 +35,9 @@ def _is_master_key(api_key: str, _master_key: Optional[str]) -> bool:


 def _get_spend_logs_metadata(
-    metadata: Optional[dict], applied_guardrails: Optional[List[str]] = None
+    metadata: Optional[dict],
+    applied_guardrails: Optional[List[str]] = None,
+    batch_models: Optional[List[str]] = None,
 ) -> SpendLogsMetadata:
     if metadata is None:
         return SpendLogsMetadata(
@@ -52,6 +54,7 @@ def _get_spend_logs_metadata(
             status=None or "success",
             error_information=None,
             proxy_server_request=None,
+            batch_models=None,
         )
     verbose_proxy_logger.debug(
         "getting payload for SpendLogs, available keys in metadata: "
@@ -67,7 +70,7 @@ def _get_spend_logs_metadata(
         }
     )
     clean_metadata["applied_guardrails"] = applied_guardrails
+    clean_metadata["batch_models"] = batch_models
     return clean_metadata
@@ -192,6 +195,11 @@ def get_logging_payload( # noqa: PLR0915
             if standard_logging_payload is not None
             else None
         ),
+        batch_models=(
+            standard_logging_payload.get("hidden_params", {}).get("batch_models", None)
+            if standard_logging_payload is not None
+            else None
+        ),
     )
     special_usage_fields = ["completion_tokens", "prompt_tokens", "total_tokens"]
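
With the plumbing above, the SpendLogs metadata stored for a completed batch would now carry the models found in the batch output, roughly as follows (only a subset of SpendLogsMetadata fields shown, values illustrative):

# Sketch of the metadata dict for a completed batch job.
{
    "applied_guardrails": [],
    "batch_models": ["gpt-4o-mini", "gpt-4o"],
    "status": "success",
    "proxy_server_request": None,
    "error_information": None,
}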


@@ -117,6 +117,8 @@ class ModelInfoBase(ProviderSpecificModelInfo, total=False):
     input_cost_per_audio_per_second: Optional[float]  # only for vertex ai models
     input_cost_per_video_per_second: Optional[float]  # only for vertex ai models
     input_cost_per_second: Optional[float]  # for OpenAI Speech models
+    input_cost_per_token_batches: Optional[float]
+    output_cost_per_token_batches: Optional[float]
     output_cost_per_token: Required[float]
     output_cost_per_character: Optional[float]  # only for vertex ai models
     output_cost_per_audio_token: Optional[float]
@@ -213,6 +215,8 @@ CallTypesLiteral = Literal[
     "acreate_batch",
     "pass_through_endpoint",
     "anthropic_messages",
+    "aretrieve_batch",
+    "retrieve_batch",
 ]
@@ -1585,6 +1589,7 @@ class StandardLoggingHiddenParams(TypedDict):
     response_cost: Optional[str]
     litellm_overhead_time_ms: Optional[float]
     additional_headers: Optional[StandardLoggingAdditionalHeaders]
+    batch_models: Optional[List[str]]


 class StandardLoggingModelInformation(TypedDict):


@@ -4408,6 +4408,12 @@ def _get_model_info_helper( # noqa: PLR0915
             input_cost_per_audio_token=_model_info.get(
                 "input_cost_per_audio_token", None
             ),
+            input_cost_per_token_batches=_model_info.get(
+                "input_cost_per_token_batches"
+            ),
+            output_cost_per_token_batches=_model_info.get(
+                "output_cost_per_token_batches"
+            ),
             output_cost_per_token=_output_cost_per_token,
             output_cost_per_audio_token=_model_info.get(
                 "output_cost_per_audio_token", None


@@ -9,7 +9,7 @@
   "model": "gpt-4o",
   "user": "",
   "team_id": "",
-  "metadata": "{\"applied_guardrails\": [], \"additional_usage_values\": {\"completion_tokens_details\": null, \"prompt_tokens_details\": null}}",
+  "metadata": "{\"applied_guardrails\": [], \"batch_models\": null, \"additional_usage_values\": {\"completion_tokens_details\": null, \"prompt_tokens_details\": null}}",
   "cache_key": "Cache OFF",
   "spend": 0.00022500000000000002,
   "total_tokens": 30,