feat: add cost tracking + caching for transcription calls

Krrish Dholakia 2024-03-09 15:43:38 -08:00
parent e10991e02b
commit fa45c569fd
8 changed files with 225 additions and 37 deletions

View file

@@ -10,7 +10,7 @@
 import litellm
 import time, logging, asyncio
 import json, traceback, ast, hashlib
-from typing import Optional, Literal, List, Union, Any
+from typing import Optional, Literal, List, Union, Any, BinaryIO
 from openai._models import BaseModel as OpenAIObject
 from litellm._logging import verbose_logger
@@ -764,8 +764,24 @@ class Cache:
         password: Optional[str] = None,
         similarity_threshold: Optional[float] = None,
         supported_call_types: Optional[
-            List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-        ] = ["completion", "acompletion", "embedding", "aembedding"],
+            List[
+                Literal[
+                    "completion",
+                    "acompletion",
+                    "embedding",
+                    "aembedding",
+                    "atranscription",
+                    "transcription",
+                ]
+            ]
+        ] = [
+            "completion",
+            "acompletion",
+            "embedding",
+            "aembedding",
+            "atranscription",
+            "transcription",
+        ],
         # s3 Bucket, boto3 configuration
         s3_bucket_name: Optional[str] = None,
         s3_region_name: Optional[str] = None,
@@ -880,9 +896,18 @@ class Cache:
             "input",
             "encoding_format",
         ]  # embedding kwargs = model, input, user, encoding_format. Model, user are checked in completion_kwargs
+        transcription_only_kwargs = [
+            "model",
+            "file",
+            "language",
+            "prompt",
+            "response_format",
+            "temperature",
+        ]
         # combined_kwargs - NEEDS to be ordered across get_cache_key(). Do not use a set()
-        combined_kwargs = completion_kwargs + embedding_only_kwargs
+        combined_kwargs = (
+            completion_kwargs + embedding_only_kwargs + transcription_only_kwargs
+        )
         for param in combined_kwargs:
             # ignore litellm params here
             if param in kwargs:
@@ -914,6 +939,17 @@ class Cache:
                     param_value = (
                         caching_group or model_group or kwargs[param]
                     )  # use caching_group, if set then model_group if it exists, else use kwargs["model"]
+                elif param == "file":
+                    metadata_file_name = kwargs.get("metadata", {}).get(
+                        "file_name", None
+                    )
+                    litellm_params_file_name = kwargs.get("litellm_params", {}).get(
+                        "file_name", None
+                    )
+                    if metadata_file_name is not None:
+                        param_value = metadata_file_name
+                    elif litellm_params_file_name is not None:
+                        param_value = litellm_params_file_name
                 else:
                     if kwargs[param] is None:
                         continue  # ignore None params
@@ -1143,8 +1179,24 @@ def enable_cache(
     port: Optional[str] = None,
     password: Optional[str] = None,
     supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"],
+        List[
+            Literal[
+                "completion",
+                "acompletion",
+                "embedding",
+                "aembedding",
+                "atranscription",
+                "transcription",
+            ]
+        ]
+    ] = [
+        "completion",
+        "acompletion",
+        "embedding",
+        "aembedding",
+        "atranscription",
+        "transcription",
+    ],
     **kwargs,
 ):
     """
@@ -1192,8 +1244,24 @@ def update_cache(
     port: Optional[str] = None,
     password: Optional[str] = None,
     supported_call_types: Optional[
-        List[Literal["completion", "acompletion", "embedding", "aembedding"]]
-    ] = ["completion", "acompletion", "embedding", "aembedding"],
+        List[
+            Literal[
+                "completion",
+                "acompletion",
+                "embedding",
+                "aembedding",
+                "atranscription",
+                "transcription",
+            ]
+        ]
+    ] = [
+        "completion",
+        "acompletion",
+        "embedding",
+        "aembedding",
+        "atranscription",
+        "transcription",
+    ],
     **kwargs,
 ):
     """

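The caching changes above extend supported_call_types and the cache-key logic so audio transcription results can be cached, keyed on the uploaded file's name (taken from metadata or litellm_params) rather than the file handle. A minimal usage sketch, assuming an in-memory cache and an illustrative local audio file (the file path and the litellm.transcription entry point are assumptions, not part of this diff):

import litellm
from litellm.caching import Cache

# Cache transcription calls alongside completions/embeddings
# ("transcription"/"atranscription" are accepted per the signature change above).
litellm.cache = Cache(
    type="local",
    supported_call_types=["completion", "acompletion", "transcription", "atranscription"],
)

# Repeated transcriptions of the same file can now hit the cache; the key is
# built from model, file name, language, prompt, response_format, temperature.
audio_file = open("speech.wav", "rb")  # illustrative file, not from the diff
transcript = litellm.transcription(model="whisper-1", file=audio_file)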
View file

@@ -861,7 +861,8 @@ class AzureChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription")  # type: ignore
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
+            final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
             return final_response

     async def async_audio_transcriptions(
@@ -921,7 +922,8 @@ class AzureChatCompletion(BaseLLM):
                 },
                 original_response=stringified_response,
             )
-            response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription")  # type: ignore
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "azure"}
+            response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
             return response
         except Exception as e:
             ## LOGGING

View file

@@ -824,7 +824,8 @@ class OpenAIChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription")  # type: ignore
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
+            final_response = convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
             return final_response

     async def async_audio_transcriptions(
@@ -862,7 +863,8 @@ class OpenAIChatCompletion(BaseLLM):
                 additional_args={"complete_input_dict": data},
                 original_response=stringified_response,
             )
-            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="audio_transcription")  # type: ignore
+            hidden_params = {"model": "whisper-1", "custom_llm_provider": "openai"}
+            return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, hidden_params=hidden_params, response_type="audio_transcription")  # type: ignore
         except Exception as e:
             ## LOGGING
             logging_obj.post_call(

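Both the Azure and OpenAI transcription handlers now attach hidden params (model and custom_llm_provider) to the response via convert_to_model_response_object, which is what lets completion_cost pick the right provider and model later. A rough sketch of reading those fields back off a transcription response; the helper name is hypothetical, and it only assumes the _hidden_params / _response_ms attributes set in this diff:

def describe_transcription_response(transcript) -> dict:
    # _hidden_params is populated by convert_to_model_response_object(hidden_params=...)
    hidden = getattr(transcript, "_hidden_params", {}) or {}
    return {
        "provider": hidden.get("custom_llm_provider"),  # e.g. "openai" or "azure"
        "model": hidden.get("model"),                   # e.g. "whisper-1"
        "latency_ms": getattr(transcript, "_response_ms", None),
    }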
View file

@@ -3282,6 +3282,7 @@ async def audio_transcriptions(
             user_api_key_dict, "team_id", None
         )
         data["metadata"]["endpoint"] = str(request.url)
+        data["metadata"]["file_name"] = file.filename

         ### TEAM-SPECIFIC PARAMS ###
         if user_api_key_dict.team_id is not None:
@@ -3316,7 +3317,7 @@ async def audio_transcriptions(
         data = await proxy_logging_obj.pre_call_hook(
             user_api_key_dict=user_api_key_dict,
             data=data,
-            call_type="moderation",
+            call_type="audio_transcription",
         )

         ## ROUTE TO CORRECT ENDPOINT ##

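On the proxy, the transcription route now records the uploaded file's name in request metadata (giving the cache key above something stable to hash) and passes the correct call_type to the pre-call hook. A hypothetical client call against a running proxy; the base URL, API key, and audio file are placeholders, not values from this commit:

from openai import OpenAI

client = OpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")  # placeholder proxy address/key

with open("speech.wav", "rb") as audio_file:  # illustrative file
    transcript = client.audio.transcriptions.create(model="whisper-1", file=audio_file)

print(transcript.text)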
View file

@@ -96,7 +96,11 @@ class ProxyLogging:
         user_api_key_dict: UserAPIKeyAuth,
         data: dict,
         call_type: Literal[
-            "completion", "embeddings", "image_generation", "moderation"
+            "completion",
+            "embeddings",
+            "image_generation",
+            "moderation",
+            "audio_transcription",
         ],
     ):
         """

View file

@@ -6,7 +6,12 @@ sys.path.insert(
 )  # Adds the parent directory to the system path
 import time
 import litellm
-from litellm import get_max_tokens, model_cost, open_ai_chat_completion_models
+from litellm import (
+    get_max_tokens,
+    model_cost,
+    open_ai_chat_completion_models,
+    TranscriptionResponse,
+)
 import pytest
@@ -238,3 +243,57 @@ def test_cost_bedrock_pricing_actual_calls():
         messages=[{"role": "user", "content": "Hey, how's it going?"}],
     )
     assert cost > 0


+def test_whisper_openai():
+    litellm.set_verbose = True
+    transcription = TranscriptionResponse(
+        text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
+    )
+    transcription._hidden_params = {
+        "model": "whisper-1",
+        "custom_llm_provider": "openai",
+        "optional_params": {},
+        "model_id": None,
+    }
+    _total_time_in_seconds = 3
+    transcription._response_ms = _total_time_in_seconds * 1000
+    cost = litellm.completion_cost(model="whisper-1", completion_response=transcription)
+    print(f"cost: {cost}")
+    print(f"whisper dict: {litellm.model_cost['whisper-1']}")
+    expected_cost = round(
+        litellm.model_cost["whisper-1"]["output_cost_per_second"]
+        * _total_time_in_seconds,
+        5,
+    )
+    assert cost == expected_cost
+
+
+def test_whisper_azure():
+    litellm.set_verbose = True
+    transcription = TranscriptionResponse(
+        text="Four score and seven years ago, our fathers brought forth on this continent a new nation, conceived in liberty and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure."
+    )
+    transcription._hidden_params = {
+        "model": "whisper-1",
+        "custom_llm_provider": "azure",
+        "optional_params": {},
+        "model_id": None,
+    }
+    _total_time_in_seconds = 3
+    transcription._response_ms = _total_time_in_seconds * 1000
+    cost = litellm.completion_cost(
+        model="azure/azure-whisper", completion_response=transcription
+    )
+    print(f"cost: {cost}")
+    print(f"whisper dict: {litellm.model_cost['whisper-1']}")
+    expected_cost = round(
+        litellm.model_cost["whisper-1"]["output_cost_per_second"]
+        * _total_time_in_seconds,
+        5,
+    )
+    assert cost == expected_cost

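The new tests assert that transcription cost is time-based: output_cost_per_second multiplied by the response duration in seconds. Illustrative arithmetic below; the per-second rate is an assumption for the example only, while the tests read the real value from litellm.model_cost["whisper-1"]:

# Hypothetical rate for illustration; not taken from this commit.
output_cost_per_second = 0.0001
response_time_seconds = 3

expected_cost = round(output_cost_per_second * response_time_seconds, 5)
print(expected_cost)  # 0.0003 with the assumed rate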
View file

@@ -1168,6 +1168,7 @@ class Logging:
                     isinstance(result, ModelResponse)
                     or isinstance(result, EmbeddingResponse)
                     or isinstance(result, ImageResponse)
+                    or isinstance(result, TranscriptionResponse)
                 )
                 and self.stream != True
             ):  # handle streaming separately
@@ -1203,9 +1204,6 @@ class Logging:
                                 model=base_model,
                             )
                         )
-                        verbose_logger.debug(
-                            f"Model={self.model}; cost={self.model_call_details['response_cost']}"
-                        )
                     except litellm.NotFoundError as e:
                         verbose_logger.debug(
                             f"Model={self.model} not found in completion cost map."
@@ -1236,7 +1234,7 @@ class Logging:
     def success_handler(
         self, result=None, start_time=None, end_time=None, cache_hit=None, **kwargs
     ):
-        verbose_logger.debug(f"Logging Details LiteLLM-Success Call: {cache_hit}")
+        print_verbose(f"Logging Details LiteLLM-Success Call: {cache_hit}")
         start_time, end_time, result = self._success_handler_helper_fn(
             start_time=start_time,
             end_time=end_time,
@@ -1681,6 +1679,7 @@ class Logging:
         """
         Implementing async callbacks, to handle asyncio event loop issues when custom integrations need to use async functions.
         """
+        print_verbose(f"Logging Details LiteLLM-Async Success Call: {cache_hit}")
         start_time, end_time, result = self._success_handler_helper_fn(
             start_time=start_time, end_time=end_time, result=result, cache_hit=cache_hit
         )
@@ -2473,6 +2472,7 @@ def client(original_function):
                 and kwargs.get("aembedding", False) != True
                 and kwargs.get("acompletion", False) != True
                 and kwargs.get("aimg_generation", False) != True
+                and kwargs.get("atranscription", False) != True
             ):  # allow users to control returning cached responses from the completion function
                 # checking cache
                 print_verbose(f"INSIDE CHECKING CACHE")
@@ -2875,6 +2875,19 @@ def client(original_function):
                             model_response_object=EmbeddingResponse(),
                             response_type="embedding",
                         )
+                    elif call_type == CallTypes.atranscription.value and isinstance(
+                        cached_result, dict
+                    ):
+                        hidden_params = {
+                            "model": "whisper-1",
+                            "custom_llm_provider": custom_llm_provider,
+                        }
+                        cached_result = convert_to_model_response_object(
+                            response_object=cached_result,
+                            model_response_object=TranscriptionResponse(),
+                            response_type="audio_transcription",
+                            hidden_params=hidden_params,
+                        )
                     if kwargs.get("stream", False) == False:
                         # LOG SUCCESS
                         asyncio.create_task(
@@ -3001,6 +3014,20 @@ def client(original_function):
                 else:
                     return result

+            # ADD HIDDEN PARAMS - additional call metadata
+            if hasattr(result, "_hidden_params"):
+                result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
+                    "id", None
+                )
+            if (
+                isinstance(result, ModelResponse)
+                or isinstance(result, EmbeddingResponse)
+                or isinstance(result, TranscriptionResponse)
+            ):
+                result._response_ms = (
+                    end_time - start_time
+                ).total_seconds() * 1000  # return response latency in ms like openai
+
             ### POST-CALL RULES ###
             post_call_processing(original_response=result, model=model)
@@ -3013,8 +3040,10 @@ def client(original_function):
                 )
                 and (kwargs.get("cache", {}).get("no-store", False) != True)
             ):
-                if isinstance(result, litellm.ModelResponse) or isinstance(
-                    result, litellm.EmbeddingResponse
+                if (
+                    isinstance(result, litellm.ModelResponse)
+                    or isinstance(result, litellm.EmbeddingResponse)
+                    or isinstance(result, TranscriptionResponse)
                 ):
                     if (
                         isinstance(result, EmbeddingResponse)
@@ -3058,18 +3087,7 @@ def client(original_function):
                         args=(result, start_time, end_time),
                     ).start()

-            # RETURN RESULT
-            if hasattr(result, "_hidden_params"):
-                result._hidden_params["model_id"] = kwargs.get("model_info", {}).get(
-                    "id", None
-                )
-            if isinstance(result, ModelResponse) or isinstance(
-                result, EmbeddingResponse
-            ):
-                result._response_ms = (
-                    end_time - start_time
-                ).total_seconds() * 1000  # return response latency in ms like openai
+            # REBUILD EMBEDDING CACHING
             if (
                 isinstance(result, EmbeddingResponse)
                 and final_embedding_cached_response is not None
@@ -3575,6 +3593,20 @@ def cost_per_token(
         completion_tokens_cost_usd_dollar = (
             model_cost_ref[model]["output_cost_per_token"] * completion_tokens
         )
+    elif (
+        model_cost_ref[model].get("output_cost_per_second", None) is not None
+        and response_time_ms is not None
+    ):
+        print_verbose(
+            f"For model={model} - output_cost_per_second: {model_cost_ref[model].get('output_cost_per_second')}; response time: {response_time_ms}"
+        )
+        ## COST PER SECOND ##
+        prompt_tokens_cost_usd_dollar = 0
+        completion_tokens_cost_usd_dollar = (
+            model_cost_ref[model]["output_cost_per_second"]
+            * response_time_ms
+            / 1000
+        )
     elif (
         model_cost_ref[model].get("input_cost_per_second", None) is not None
         and response_time_ms is not None
@@ -3659,6 +3691,8 @@ def completion_cost(
         "text_completion",
         "image_generation",
         "aimage_generation",
+        "transcription",
+        "atranscription",
     ] = "completion",
     ### REGION ###
     custom_llm_provider=None,
@@ -3703,6 +3737,7 @@ def completion_cost(
             and custom_llm_provider == "azure"
         ):
             model = "dall-e-2"  # for dall-e-2, azure expects an empty model name
+
         # Handle Inputs to completion_cost
         prompt_tokens = 0
         completion_tokens = 0
@@ -3717,10 +3752,11 @@ def completion_cost(
             verbose_logger.debug(
                 f"completion_response response ms: {completion_response.get('_response_ms')} "
             )
-            model = (
-                model or completion_response["model"]
+            model = model or completion_response.get(
+                "model", None
             )  # check if user passed an override for model, if it's none check completion_response['model']
             if hasattr(completion_response, "_hidden_params"):
+                model = completion_response._hidden_params.get("model", model)
                 custom_llm_provider = completion_response._hidden_params.get(
                     "custom_llm_provider", ""
                 )
@@ -3801,6 +3837,7 @@ def completion_cost(
         # see https://replicate.com/pricing
         elif model in litellm.replicate_models or "replicate" in model:
             return get_replicate_completion_pricing(completion_response, total_time)
+
         (
             prompt_tokens_cost_usd_dollar,
             completion_tokens_cost_usd_dollar,
@@ -6314,6 +6351,7 @@ def convert_to_model_response_object(
     stream=False,
     start_time=None,
     end_time=None,
+    hidden_params: Optional[dict] = None,
 ):
     try:
         if response_type == "completion" and (
@@ -6373,6 +6411,9 @@ def convert_to_model_response_object(
                     end_time - start_time
                 ).total_seconds() * 1000

+            if hidden_params is not None:
+                model_response_object._hidden_params = hidden_params
+
             return model_response_object
         elif response_type == "embedding" and (
             model_response_object is None
@@ -6402,6 +6443,9 @@ def convert_to_model_response_object(
                     end_time - start_time
                 ).total_seconds() * 1000  # return response latency in ms like openai

+            if hidden_params is not None:
+                model_response_object._hidden_params = hidden_params
+
             return model_response_object
         elif response_type == "image_generation" and (
             model_response_object is None
@@ -6419,6 +6463,9 @@ def convert_to_model_response_object(
             if "data" in response_object:
                 model_response_object.data = response_object["data"]

+            if hidden_params is not None:
+                model_response_object._hidden_params = hidden_params
+
             return model_response_object
         elif response_type == "audio_transcription" and (
             model_response_object is None
@@ -6432,6 +6479,9 @@ def convert_to_model_response_object(
             if "text" in response_object:
                 model_response_object.text = response_object["text"]

+            if hidden_params is not None:
+                model_response_object._hidden_params = hidden_params
+
             return model_response_object
     except Exception as e:
         raise Exception(f"Invalid response object {traceback.format_exc()}")

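The cost_per_token change above adds a duration-based pricing branch: when a model defines output_cost_per_second and a response time is available, cost is that rate times the response time in seconds (response_time_ms / 1000), with the prompt-token cost set to zero. A minimal standalone sketch of that branch; model_cost_ref stands in for litellm.model_cost, and the rate in the example call is an assumption:

from typing import Optional

def per_second_output_cost(
    model_cost_ref: dict, model: str, response_time_ms: Optional[float]
) -> float:
    """Duration-based cost, mirroring the output_cost_per_second branch above."""
    cost_per_second = model_cost_ref.get(model, {}).get("output_cost_per_second")
    if cost_per_second is None or response_time_ms is None:
        return 0.0
    return cost_per_second * response_time_ms / 1000

# e.g. a 3s whisper call at an assumed $0.0001/second:
print(per_second_output_cost({"whisper-1": {"output_cost_per_second": 0.0001}}, "whisper-1", 3000))  # 0.0003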
View file

@@ -31,7 +31,8 @@ def test_transcription():
         model="whisper-1",
         file=audio_file,
     )
-    print(f"transcript: {transcript}")
+    print(f"transcript: {transcript.model_dump()}")
+    print(f"transcript: {transcript._hidden_params}")


 # test_transcription()
@@ -47,6 +48,7 @@ def test_transcription_azure():
         api_version="2024-02-15-preview",
     )
+    print(f"transcript: {transcript}")
     assert transcript.text is not None
     assert isinstance(transcript.text, str)