(Feat) Add "x-litellm-overhead-duration-ms" and "x-litellm-response-duration-ms" to responses from LiteLLM (#7899)

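In practice these headers can be read straight off a proxy response. A minimal sketch, assuming a LiteLLM proxy running at http://localhost:4000 and that both headers are attached to completion responses; the URL, API key, and model below are placeholders:

import requests

resp = requests.post(
    "http://localhost:4000/v1/chat/completions",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder proxy key
    json={"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hi"}]},
)

# Total time LiteLLM took to return the response, in milliseconds.
print(resp.headers.get("x-litellm-response-duration-ms"))
# Portion of that time attributed to LiteLLM itself rather than the upstream LLM API.
print(resp.headers.get("x-litellm-overhead-duration-ms"))
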
* add track_llm_api_timing

* add track_llm_api_timing

* test_litellm_overhead

* use ResponseMetadata class for setting hidden params and response overhead

* instrument http handler

* fix track_llm_api_timing

* track_llm_api_timing

* emit response overhead on hidden params

* fix resp metadata

* fix make_sync_openai_embedding_request

* test_aaaaatext_completion_endpoint fixes

* _get_value_from_hidden_params

* set_hidden_params

* test_litellm_overhead

* test_litellm_overhead

* test_litellm_overhead

* fix import

* test_litellm_overhead_stream

* add LiteLLMLoggingObject

* use diff folder for testing

* use diff folder for overhead testing

* test litellm overhead

* use typing

* clear typing

* test_litellm_overhead

* fix async_streaming

* update_response_metadata

* move test file

* apply metadata to the response object
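The bullets above all serve one mechanism: time the actual LLM API call with a decorator, record the measured duration on the logging object, and later report total response time minus API time as LiteLLM overhead. A rough sketch of that pattern, assuming the timed call receives logging_obj as a keyword argument; the attribute name below is illustrative, not the real LiteLLM field:

import functools
import time

def track_llm_api_timing():
    """Sketch of a timing decorator for async API-call helpers (sync variant omitted)."""
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            start = time.monotonic()
            try:
                return await func(*args, **kwargs)
            finally:
                logging_obj = kwargs.get("logging_obj")
                if logging_obj is not None:
                    # Illustrative attribute; the real logging object tracks this internally.
                    logging_obj.llm_api_duration_ms = (time.monotonic() - start) * 1000
        return wrapper
    return decorator
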
Commit b6f2e659b9 (parent 63d7d04232)
Ishaan Jaff, 2025-01-21 20:27:55 -08:00, committed by GitHub
17 changed files with 464 additions and 73 deletions


@@ -28,6 +28,7 @@ from litellm import verbose_logger
 from litellm.caching.caching import InMemoryCache
 from litellm.litellm_core_utils.core_helpers import map_finish_reason
 from litellm.litellm_core_utils.litellm_logging import Logging
+from litellm.litellm_core_utils.logging_utils import track_llm_api_timing
 from litellm.litellm_core_utils.prompt_templates.factory import (
     cohere_message_pt,
     construct_tool_use_system_prompt,
@@ -171,7 +172,7 @@ async def make_call(
     data: str,
     model: str,
     messages: list,
-    logging_obj,
+    logging_obj: Logging,
     fake_stream: bool = False,
     json_mode: Optional[bool] = False,
 ):
@@ -186,6 +187,7 @@
         headers=headers,
         data=data,
         stream=not fake_stream,
+        logging_obj=logging_obj,
     )
 
     if response.status_code != 200:
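This hunk (and the similar ones below) threads logging_obj through to the shared HTTP client, so the client can attribute the raw network time to the request. A simplified sketch of what an instrumented async post might look like; the real AsyncHTTPHandler signature and its streaming path differ:

import time
import httpx

class InstrumentedAsyncClient:
    """Illustrative stand-in for an HTTP handler that reports API-call timing."""

    def __init__(self):
        self._client = httpx.AsyncClient()

    async def post(self, url, headers=None, data=None, stream=False, logging_obj=None, timeout=None):
        start = time.monotonic()
        # Streaming requests would use self._client.stream(...) instead; omitted for brevity.
        response = await self._client.post(url, headers=headers, content=data, timeout=timeout)
        if logging_obj is not None:
            # Illustrative attribute: how long the raw HTTP round trip took.
            logging_obj.llm_api_duration_ms = (time.monotonic() - start) * 1000
        return response
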
@@ -577,7 +579,7 @@ class BedrockLLM(BaseAWSLLM):
         model_response: ModelResponse,
         print_verbose: Callable,
         encoding,
-        logging_obj,
+        logging_obj: Logging,
         optional_params: dict,
         acompletion: bool,
         timeout: Optional[Union[float, httpx.Timeout]],
@@ -890,6 +892,7 @@
                 headers=prepped.headers, # type: ignore
                 data=data,
                 stream=stream,
+                logging_obj=logging_obj,
             )
 
             if response.status_code != 200:
@@ -917,7 +920,12 @@
             return streaming_response
 
         try:
-            response = self.client.post(url=proxy_endpoint_url, headers=prepped.headers, data=data) # type: ignore
+            response = self.client.post(
+                url=proxy_endpoint_url,
+                headers=dict(prepped.headers),
+                data=data,
+                logging_obj=logging_obj,
+            )
             response.raise_for_status()
         except httpx.HTTPStatusError as err:
             error_code = err.response.status_code
@@ -949,7 +957,7 @@
         data: str,
         timeout: Optional[Union[float, httpx.Timeout]],
         encoding,
-        logging_obj,
+        logging_obj: Logging,
         stream,
         optional_params: dict,
         litellm_params=None,
@@ -968,7 +976,13 @@
             client = client # type: ignore
 
         try:
-            response = await client.post(api_base, headers=headers, data=data) # type: ignore
+            response = await client.post(
+                api_base,
+                headers=headers,
+                data=data,
+                timeout=timeout,
+                logging_obj=logging_obj,
+            )
             response.raise_for_status()
         except httpx.HTTPStatusError as err:
             error_code = err.response.status_code
@@ -990,6 +1004,7 @@
             encoding=encoding,
         )
 
+    @track_llm_api_timing()  # for streaming, we need to instrument the function calling the wrapper
     async def async_streaming(
         self,
         model: str,
@@ -1000,7 +1015,7 @@
         data: str,
         timeout: Optional[Union[float, httpx.Timeout]],
         encoding,
-        logging_obj,
+        logging_obj: Logging,
         stream,
         optional_params: dict,
         litellm_params=None,
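The ResponseMetadata, set_hidden_params, and update_response_metadata commits cover the other half of the picture: once the API-call duration is known, overhead is derived and exposed through the response's hidden params, which the proxy later surfaces as the two headers named in the title. Roughly, with illustrative field and key names where the real ones are not shown in this diff:

class ResponseMetadata:
    """Sketch only: derive LiteLLM overhead as total time minus LLM API time."""

    def __init__(self, start_time: float, end_time: float, llm_api_duration_ms: float):
        self.total_ms = (end_time - start_time) * 1000
        self.overhead_ms = max(self.total_ms - llm_api_duration_ms, 0)

    def set_hidden_params(self, response) -> None:
        # Hidden params are what eventually back x-litellm-response-duration-ms /
        # x-litellm-overhead-duration-ms on proxy responses; these key names are illustrative.
        hidden = getattr(response, "_hidden_params", {}) or {}
        hidden["response_duration_ms"] = self.total_ms
        hidden["litellm_overhead_ms"] = self.overhead_ms
        response._hidden_params = hidden
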