Merge branch 'main' into litellm_reset_key_budget

Krish Dholakia 2024-01-23 18:10:32 -08:00 committed by GitHub
commit 9784d03d65
13 changed files with 492 additions and 78 deletions


@@ -1067,7 +1067,6 @@ class Logging:
## if model in model cost map - log the response cost
## else set cost to None
verbose_logger.debug(f"Model={self.model}; result={result}")
verbose_logger.debug(f"self.stream: {self.stream}")
if (
result is not None
and (
@@ -1109,6 +1108,12 @@ class Logging:
self, result=None, start_time=None, end_time=None, cache_hit=None, **kwargs
):
verbose_logger.debug(f"Logging Details LiteLLM-Success Call")
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time,
end_time=end_time,
result=result,
cache_hit=cache_hit,
)
# print(f"original response in success handler: {self.model_call_details['original_response']}")
try:
verbose_logger.debug(f"success callbacks: {litellm.success_callback}")
@@ -1124,6 +1129,8 @@ class Logging:
complete_streaming_response = litellm.stream_chunk_builder(
self.sync_streaming_chunks,
messages=self.model_call_details.get("messages", None),
start_time=start_time,
end_time=end_time,
)
except:
complete_streaming_response = None
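
The two added keyword arguments pass the call's real timing through to `litellm.stream_chunk_builder`, so the stitched-together response reflects when streaming actually started and ended. A rough sketch of what stitching chunks back into one message looks like (the real builder lives in litellm; the field names below follow the OpenAI chunk shape):

```python
# Rough sketch of rebuilding one assistant message from streamed delta chunks.
def join_stream_chunks(chunks):
    parts = []
    for chunk in chunks:
        delta = chunk["choices"][0].get("delta", {})
        if delta.get("content"):
            parts.append(delta["content"])
    return {"role": "assistant", "content": "".join(parts)}

print(join_stream_chunks([
    {"choices": [{"delta": {"content": "Hel"}}]},
    {"choices": [{"delta": {"content": "lo"}}]},
]))  # {'role': 'assistant', 'content': 'Hello'}
```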
@@ -1137,13 +1144,19 @@ class Logging:
self.model_call_details[
"complete_streaming_response"
] = complete_streaming_response
try:
self.model_call_details["response_cost"] = litellm.completion_cost(
completion_response=complete_streaming_response,
)
verbose_logger.debug(
f"Model={self.model}; cost={self.model_call_details['response_cost']}"
)
except litellm.NotFoundError as e:
verbose_logger.debug(
f"Model={self.model} not found in completion cost map."
)
self.model_call_details["response_cost"] = None
start_time, end_time, result = self._success_handler_helper_fn(
start_time=start_time,
end_time=end_time,
result=result,
cache_hit=cache_hit,
)
for callback in litellm.success_callback:
try:
if callback == "lite_debugger":
@@ -1487,14 +1500,27 @@ class Logging:
end_time=end_time,
)
if callable(callback): # custom logger functions
await customLogger.async_log_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
print_verbose=print_verbose,
callback_func=callback,
)
if self.stream:
if "complete_streaming_response" in self.model_call_details:
await customLogger.async_log_event(
kwargs=self.model_call_details,
response_obj=self.model_call_details[
"complete_streaming_response"
],
start_time=start_time,
end_time=end_time,
print_verbose=print_verbose,
callback_func=callback,
)
else:
await customLogger.async_log_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
print_verbose=print_verbose,
callback_func=callback,
)
if callback == "dynamodb":
global dynamoLogger
if dynamoLogger is None:
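
For streaming calls, a callable custom callback now receives the assembled `complete_streaming_response` (when it exists) instead of the last raw chunk. A sketch of a user-side async callback, assuming it is invoked with the same arguments `async_log_event` forwards above:

```python
# Hypothetical user callback; argument names mirror what async_log_event
# forwards above. Timestamps are assumed to be datetime objects.
async def my_async_logger(kwargs, response_obj, start_time, end_time):
    latency_s = (end_time - start_time).total_seconds()
    print(f"model={kwargs.get('model')} latency={latency_s:.2f}s")
    print(f"full response: {response_obj}")
```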
@@ -2915,17 +2941,25 @@ def cost_per_token(
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model_with_provider in model_cost_ref:
print_verbose(f"Looking up model={model_with_provider} in model_cost_map")
verbose_logger.debug(
f"Looking up model={model_with_provider} in model_cost_map"
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model_with_provider]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["input_cost_per_token"] * prompt_tokens
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model_with_provider]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model_with_provider]["output_cost_per_token"]
* completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif "ft:gpt-3.5-turbo" in model:
print_verbose(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
verbose_logger.debug(f"Cost Tracking: {model} is an OpenAI FinteTuned LLM")
# fuzzy match ft:gpt-3.5-turbo:abcd-id-cool-litellm
prompt_tokens_cost_usd_dollar = (
model_cost_ref["ft:gpt-3.5-turbo"]["input_cost_per_token"] * prompt_tokens
@@ -2936,17 +2970,23 @@ def cost_per_token(
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_llms:
print_verbose(f"Cost Tracking: {model} is an Azure LLM")
verbose_logger.debug(f"Cost Tracking: {model} is an Azure LLM")
model = litellm.azure_llms[model]
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['input_cost_per_token']} for prompt_tokens={prompt_tokens}"
)
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
)
verbose_logger.debug(
f"applying cost={model_cost_ref[model]['output_cost_per_token']} for completion_tokens={completion_tokens}"
)
completion_tokens_cost_usd_dollar = (
model_cost_ref[model]["output_cost_per_token"] * completion_tokens
)
return prompt_tokens_cost_usd_dollar, completion_tokens_cost_usd_dollar
elif model in litellm.azure_embedding_models:
print_verbose(f"Cost Tracking: {model} is an Azure Embedding Model")
verbose_logger.debug(f"Cost Tracking: {model} is an Azure Embedding Model")
model = litellm.azure_embedding_models[model]
prompt_tokens_cost_usd_dollar = (
model_cost_ref[model]["input_cost_per_token"] * prompt_tokens
@@ -7061,6 +7101,7 @@ class CustomStreamWrapper:
self._hidden_params = {
"model_id": (_model_info.get("id", None))
} # returned as x-litellm-model-id response header in proxy
self.response_id = None
def __iter__(self):
return self
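
Per the inline comment, the `model_id` stashed in `_hidden_params` is surfaced by the proxy as the `x-litellm-model-id` response header. A hypothetical client-side read (the URL and payload are assumptions, not from this diff):

```python
import requests

# Hypothetical request against a locally running litellm proxy.
resp = requests.post(
    "http://localhost:4000/chat/completions",
    json={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]},
)
print(resp.headers.get("x-litellm-model-id"))  # header named in the comment above
```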
@@ -7633,6 +7674,10 @@ class CustomStreamWrapper:
def chunk_creator(self, chunk):
model_response = ModelResponse(stream=True, model=self.model)
if self.response_id is not None:
model_response.id = self.response_id
else:
self.response_id = model_response.id
model_response._hidden_params["custom_llm_provider"] = self.custom_llm_provider
model_response.choices = [StreamingChoices()]
model_response.choices[0].finish_reason = None
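
The new `response_id` bookkeeping pins the id of the first chunk and reuses it for every later chunk, matching OpenAI-style streams where all chunks of one completion share a single id. The pattern in isolation (stub class for illustration; the real `ModelResponse` is litellm's):

```python
import uuid

class StubChunk:  # stand-in for ModelResponse, illustration only
    def __init__(self):
        self.id = f"chatcmpl-{uuid.uuid4()}"

response_id = None
for _ in range(3):
    chunk = StubChunk()
    if response_id is not None:
        chunk.id = response_id      # reuse the pinned id
    else:
        response_id = chunk.id      # pin the first chunk's id
    print(chunk.id)                 # identical on every iteration
```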
@@ -7752,10 +7797,8 @@ class CustomStreamWrapper:
]
self.sent_last_chunk = True
elif self.custom_llm_provider == "sagemaker":
print_verbose(f"ENTERS SAGEMAKER STREAMING")
new_chunk = next(self.completion_stream)
completion_obj["content"] = new_chunk
print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}")
completion_obj["content"] = chunk
elif self.custom_llm_provider == "petals":
if len(self.completion_stream) == 0:
if self.sent_last_chunk:
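
The SageMaker branch stops calling `next()` on `self.completion_stream` inside the handler and uses the `chunk` it was handed. Calling `next()` again while the wrapper is already iterating the stream appears to advance the iterator twice, dropping chunks; a minimal illustration (names here are illustrative, not litellm's):

```python
stream = iter(["a", "b", "c", "d"])

# New shape: use the chunk the iteration loop already read; a second
# next(stream) inside the body (the old shape) would skip every other item.
for chunk in stream:
    handled = chunk
    print(handled)  # a, b, c, d — nothing skipped
```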
@@ -7874,7 +7917,7 @@ class CustomStreamWrapper:
completion_obj["role"] = "assistant"
self.sent_first_chunk = True
model_response.choices[0].delta = Delta(**completion_obj)
print_verbose(f"model_response: {model_response}")
print_verbose(f"returning model_response: {model_response}")
return model_response
else:
return