diff --git a/.circleci/config.yml b/.circleci/config.yml index 3c8ec30b45..480f0e3400 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -669,7 +669,7 @@ jobs: paths: - batches_coverage.xml - batches_coverage - secret_manager_testing: + litellm_utils_testing: docker: - image: cimg/python:3.11 auth: @@ -697,13 +697,13 @@ jobs: command: | pwd ls - python -m pytest -vv tests/secret_manager_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 + python -m pytest -vv tests/litellm_utils_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5 no_output_timeout: 120m - run: name: Rename the coverage files command: | - mv coverage.xml secret_manager_coverage.xml - mv .coverage secret_manager_coverage + mv coverage.xml litellm_utils_coverage.xml + mv .coverage litellm_utils_coverage # Store test results - store_test_results: @@ -711,8 +711,8 @@ jobs: - persist_to_workspace: root: . paths: - - secret_manager_coverage.xml - - secret_manager_coverage + - litellm_utils_coverage.xml + - litellm_utils_coverage pass_through_unit_testing: docker: @@ -2029,7 +2029,7 @@ workflows: only: - main - /litellm_.*/ - - secret_manager_testing: + - litellm_utils_testing: filters: branches: only: @@ -2057,7 +2057,7 @@ workflows: requires: - llm_translation_testing - batches_testing - - secret_manager_testing + - litellm_utils_testing - pass_through_unit_testing - image_gen_testing - logging_testing @@ -2113,7 +2113,7 @@ workflows: - test_bad_database_url - llm_translation_testing - batches_testing - - secret_manager_testing + - litellm_utils_testing - pass_through_unit_testing - image_gen_testing - logging_testing diff --git a/litellm/litellm_core_utils/llm_response_utils/response_metadata.py b/litellm/litellm_core_utils/llm_response_utils/response_metadata.py new file mode 100644 index 0000000000..03595e27a4 --- /dev/null +++ b/litellm/litellm_core_utils/llm_response_utils/response_metadata.py @@ -0,0 +1,116 @@ +import datetime +from typing import Any, Optional, Union + +from litellm.litellm_core_utils.core_helpers import process_response_headers +from litellm.litellm_core_utils.llm_response_utils.get_api_base import get_api_base +from litellm.litellm_core_utils.logging_utils import LiteLLMLoggingObject +from litellm.types.utils import ( + EmbeddingResponse, + HiddenParams, + ModelResponse, + TranscriptionResponse, +) + + +class ResponseMetadata: + """ + Handles setting and managing `_hidden_params`, `response_time_ms`, and `litellm_overhead_time_ms` for LiteLLM responses + """ + + def __init__(self, result: Any): + self.result = result + self._hidden_params: Union[HiddenParams, dict] = ( + getattr(result, "_hidden_params", {}) or {} + ) + + @property + def supports_response_time(self) -> bool: + """Check if response type supports timing metrics""" + return ( + isinstance(self.result, ModelResponse) + or isinstance(self.result, EmbeddingResponse) + or isinstance(self.result, TranscriptionResponse) + ) + + def set_hidden_params( + self, logging_obj: LiteLLMLoggingObject, model: Optional[str], kwargs: dict + ) -> None: + """Set hidden parameters on the response""" + new_params = { + "litellm_call_id": getattr(logging_obj, "litellm_call_id", None), + "model_id": kwargs.get("model_info", {}).get("id", None), + "api_base": get_api_base(model=model or "", optional_params=kwargs), + "response_cost": logging_obj._response_cost_calculator(result=self.result), + "additional_headers": process_response_headers( + 
self._get_value_from_hidden_params("additional_headers") or {} + ), + } + self._update_hidden_params(new_params) + + def _update_hidden_params(self, new_params: dict) -> None: + """ + Update hidden params - handles when self._hidden_params is a dict or HiddenParams object + """ + # Handle both dict and HiddenParams cases + if isinstance(self._hidden_params, dict): + self._hidden_params.update(new_params) + elif isinstance(self._hidden_params, HiddenParams): + # For HiddenParams object, set attributes individually + for key, value in new_params.items(): + setattr(self._hidden_params, key, value) + + def _get_value_from_hidden_params(self, key: str) -> Optional[Any]: + """Get value from hidden params - handles when self._hidden_params is a dict or HiddenParams object""" + if isinstance(self._hidden_params, dict): + return self._hidden_params.get(key, None) + elif isinstance(self._hidden_params, HiddenParams): + return getattr(self._hidden_params, key, None) + + def set_timing_metrics( + self, + start_time: datetime.datetime, + end_time: datetime.datetime, + logging_obj: LiteLLMLoggingObject, + ) -> None: + """Set response timing metrics""" + total_response_time_ms = (end_time - start_time).total_seconds() * 1000 + + # Set total response time if supported + if self.supports_response_time: + self.result._response_ms = total_response_time_ms + + # Calculate LiteLLM overhead + llm_api_duration_ms = logging_obj.model_call_details.get("llm_api_duration_ms") + if llm_api_duration_ms is not None: + overhead_ms = round(total_response_time_ms - llm_api_duration_ms, 4) + self._update_hidden_params( + { + "litellm_overhead_time_ms": overhead_ms, + "_response_ms": total_response_time_ms, + } + ) + + def apply(self) -> None: + """Apply metadata to the response object""" + if hasattr(self.result, "_hidden_params"): + self.result._hidden_params = self._hidden_params + + +def update_response_metadata( + result: Any, + logging_obj: LiteLLMLoggingObject, + model: Optional[str], + kwargs: dict, + start_time: datetime.datetime, + end_time: datetime.datetime, +) -> None: + """ + Updates response metadata including hidden params and timing metrics + """ + if result is None: + return + + metadata = ResponseMetadata(result) + metadata.set_hidden_params(logging_obj, model, kwargs) + metadata.set_timing_metrics(start_time, end_time, logging_obj) + metadata.apply() diff --git a/litellm/litellm_core_utils/logging_utils.py b/litellm/litellm_core_utils/logging_utils.py index fb8689a522..41a91f9888 100644 --- a/litellm/litellm_core_utils/logging_utils.py +++ b/litellm/litellm_core_utils/logging_utils.py @@ -1,3 +1,5 @@ +import asyncio +import functools from datetime import datetime from typing import TYPE_CHECKING, Any, List, Optional, Union @@ -10,10 +12,14 @@ from litellm.types.utils import ( if TYPE_CHECKING: from litellm import ModelResponse as _ModelResponse + from litellm.litellm_core_utils.litellm_logging import ( + Logging as LiteLLMLoggingObject, + ) LiteLLMModelResponse = _ModelResponse else: LiteLLMModelResponse = Any + LiteLLMLoggingObject = Any import litellm @@ -91,3 +97,64 @@ def _assemble_complete_response_from_streaming_chunks( else: streaming_chunks.append(result) return complete_streaming_response + + +def _set_duration_in_model_call_details( + logging_obj: Any, # we're not guaranteed this will be `LiteLLMLoggingObject` + start_time: datetime, + end_time: datetime, +): + """Helper to set duration in model_call_details, with error handling""" + try: + duration_ms = (end_time - start_time).total_seconds() 
* 1000 + if logging_obj and hasattr(logging_obj, "model_call_details"): + logging_obj.model_call_details["llm_api_duration_ms"] = duration_ms + else: + verbose_logger.warning( + "`logging_obj` not found - unable to track `llm_api_duration_ms" + ) + except Exception as e: + verbose_logger.warning(f"Error setting `llm_api_duration_ms`: {str(e)}") + + +def track_llm_api_timing(): + """ + Decorator to track LLM API call timing for both sync and async functions. + The logging_obj is expected to be passed as an argument to the decorated function. + """ + + def decorator(func): + @functools.wraps(func) + async def async_wrapper(*args, **kwargs): + start_time = datetime.now() + try: + result = await func(*args, **kwargs) + return result + finally: + end_time = datetime.now() + _set_duration_in_model_call_details( + logging_obj=kwargs.get("logging_obj", None), + start_time=start_time, + end_time=end_time, + ) + + @functools.wraps(func) + def sync_wrapper(*args, **kwargs): + start_time = datetime.now() + try: + result = func(*args, **kwargs) + return result + finally: + end_time = datetime.now() + _set_duration_in_model_call_details( + logging_obj=kwargs.get("logging_obj", None), + start_time=start_time, + end_time=end_time, + ) + + # Check if the function is async or sync + if asyncio.iscoroutinefunction(func): + return async_wrapper + return sync_wrapper + + return decorator diff --git a/litellm/llms/bedrock/chat/converse_handler.py b/litellm/llms/bedrock/chat/converse_handler.py index b6553f8bcc..5f50b78251 100644 --- a/litellm/llms/bedrock/chat/converse_handler.py +++ b/litellm/llms/bedrock/chat/converse_handler.py @@ -5,6 +5,7 @@ from typing import Any, Callable, Optional, Union import httpx import litellm +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObject from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, HTTPHandler, @@ -26,7 +27,7 @@ def make_sync_call( data: str, model: str, messages: list, - logging_obj, + logging_obj: LiteLLMLoggingObject, json_mode: Optional[bool] = False, fake_stream: bool = False, ): @@ -38,6 +39,7 @@ def make_sync_call( headers=headers, data=data, stream=not fake_stream, + logging_obj=logging_obj, ) if response.status_code != 200: @@ -171,7 +173,7 @@ class BedrockConverseLLM(BaseAWSLLM): print_verbose: Callable, timeout: Optional[Union[float, httpx.Timeout]], encoding, - logging_obj, + logging_obj: LiteLLMLoggingObject, stream, optional_params: dict, litellm_params: dict, @@ -223,7 +225,9 @@ class BedrockConverseLLM(BaseAWSLLM): client = client # type: ignore try: - response = await client.post(url=api_base, headers=headers, data=data) # type: ignore + response = await client.post( + url=api_base, headers=headers, data=data, logging_obj=logging_obj + ) # type: ignore response.raise_for_status() except httpx.HTTPStatusError as err: error_code = err.response.status_code @@ -254,7 +258,7 @@ class BedrockConverseLLM(BaseAWSLLM): model_response: ModelResponse, print_verbose: Callable, encoding, - logging_obj, + logging_obj: LiteLLMLoggingObject, optional_params: dict, acompletion: bool, timeout: Optional[Union[float, httpx.Timeout]], @@ -458,7 +462,12 @@ class BedrockConverseLLM(BaseAWSLLM): ### COMPLETION try: - response = client.post(url=proxy_endpoint_url, headers=prepped.headers, data=data) # type: ignore + response = client.post( + url=proxy_endpoint_url, + headers=prepped.headers, + data=data, + logging_obj=logging_obj, + ) # type: ignore response.raise_for_status() except httpx.HTTPStatusError as err: 
error_code = err.response.status_code diff --git a/litellm/llms/bedrock/chat/invoke_handler.py b/litellm/llms/bedrock/chat/invoke_handler.py index 8aa9a4db04..5ade1dc2dc 100644 --- a/litellm/llms/bedrock/chat/invoke_handler.py +++ b/litellm/llms/bedrock/chat/invoke_handler.py @@ -28,6 +28,7 @@ from litellm import verbose_logger from litellm.caching.caching import InMemoryCache from litellm.litellm_core_utils.core_helpers import map_finish_reason from litellm.litellm_core_utils.litellm_logging import Logging +from litellm.litellm_core_utils.logging_utils import track_llm_api_timing from litellm.litellm_core_utils.prompt_templates.factory import ( cohere_message_pt, construct_tool_use_system_prompt, @@ -171,7 +172,7 @@ async def make_call( data: str, model: str, messages: list, - logging_obj, + logging_obj: Logging, fake_stream: bool = False, json_mode: Optional[bool] = False, ): @@ -186,6 +187,7 @@ async def make_call( headers=headers, data=data, stream=not fake_stream, + logging_obj=logging_obj, ) if response.status_code != 200: @@ -577,7 +579,7 @@ class BedrockLLM(BaseAWSLLM): model_response: ModelResponse, print_verbose: Callable, encoding, - logging_obj, + logging_obj: Logging, optional_params: dict, acompletion: bool, timeout: Optional[Union[float, httpx.Timeout]], @@ -890,6 +892,7 @@ class BedrockLLM(BaseAWSLLM): headers=prepped.headers, # type: ignore data=data, stream=stream, + logging_obj=logging_obj, ) if response.status_code != 200: @@ -917,7 +920,12 @@ class BedrockLLM(BaseAWSLLM): return streaming_response try: - response = self.client.post(url=proxy_endpoint_url, headers=prepped.headers, data=data) # type: ignore + response = self.client.post( + url=proxy_endpoint_url, + headers=dict(prepped.headers), + data=data, + logging_obj=logging_obj, + ) response.raise_for_status() except httpx.HTTPStatusError as err: error_code = err.response.status_code @@ -949,7 +957,7 @@ class BedrockLLM(BaseAWSLLM): data: str, timeout: Optional[Union[float, httpx.Timeout]], encoding, - logging_obj, + logging_obj: Logging, stream, optional_params: dict, litellm_params=None, @@ -968,7 +976,13 @@ class BedrockLLM(BaseAWSLLM): client = client # type: ignore try: - response = await client.post(api_base, headers=headers, data=data) # type: ignore + response = await client.post( + api_base, + headers=headers, + data=data, + timeout=timeout, + logging_obj=logging_obj, + ) response.raise_for_status() except httpx.HTTPStatusError as err: error_code = err.response.status_code @@ -990,6 +1004,7 @@ class BedrockLLM(BaseAWSLLM): encoding=encoding, ) + @track_llm_api_timing() # for streaming, we need to instrument the function calling the wrapper async def async_streaming( self, model: str, @@ -1000,7 +1015,7 @@ class BedrockLLM(BaseAWSLLM): data: str, timeout: Optional[Union[float, httpx.Timeout]], encoding, - logging_obj, + logging_obj: Logging, stream, optional_params: dict, litellm_params=None, diff --git a/litellm/llms/custom_httpx/http_handler.py b/litellm/llms/custom_httpx/http_handler.py index c1be9c3baa..469bb693fb 100644 --- a/litellm/llms/custom_httpx/http_handler.py +++ b/litellm/llms/custom_httpx/http_handler.py @@ -6,12 +6,17 @@ import httpx from httpx import USE_CLIENT_DEFAULT, AsyncHTTPTransport, HTTPTransport import litellm +from litellm.litellm_core_utils.logging_utils import track_llm_api_timing from litellm.types.llms.custom_http import * if TYPE_CHECKING: from litellm import LlmProviders + from litellm.litellm_core_utils.litellm_logging import ( + Logging as LiteLLMLoggingObject, + ) else: 
LlmProviders = Any + LiteLLMLoggingObject = Any try: from litellm._version import version @@ -156,6 +161,7 @@ class AsyncHTTPHandler: ) return response + @track_llm_api_timing() async def post( self, url: str, @@ -165,6 +171,7 @@ class AsyncHTTPHandler: headers: Optional[dict] = None, timeout: Optional[Union[float, httpx.Timeout]] = None, stream: bool = False, + logging_obj: Optional[LiteLLMLoggingObject] = None, ): try: if timeout is None: @@ -494,6 +501,7 @@ class HTTPHandler: timeout: Optional[Union[float, httpx.Timeout]] = None, files: Optional[dict] = None, content: Any = None, + logging_obj: Optional[LiteLLMLoggingObject] = None, ): try: if timeout is not None: diff --git a/litellm/llms/openai/openai.py b/litellm/llms/openai/openai.py index 4d3a76ad5c..aa361422fe 100644 --- a/litellm/llms/openai/openai.py +++ b/litellm/llms/openai/openai.py @@ -27,6 +27,7 @@ import litellm from litellm import LlmProviders from litellm._logging import verbose_logger from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj +from litellm.litellm_core_utils.logging_utils import track_llm_api_timing from litellm.llms.base_llm.base_model_iterator import BaseModelResponseIterator from litellm.llms.base_llm.chat.transformation import BaseConfig, BaseLLMException from litellm.llms.bedrock.chat.invoke_handler import MockResponseIterator @@ -380,11 +381,13 @@ class OpenAIChatCompletion(BaseLLM): else: return client + @track_llm_api_timing() async def make_openai_chat_completion_request( self, openai_aclient: AsyncOpenAI, data: dict, timeout: Union[float, httpx.Timeout], + logging_obj: LiteLLMLoggingObj, ) -> Tuple[dict, BaseModel]: """ Helper to: @@ -414,11 +417,13 @@ class OpenAIChatCompletion(BaseLLM): except Exception as e: raise e + @track_llm_api_timing() def make_sync_openai_chat_completion_request( self, openai_client: OpenAI, data: dict, timeout: Union[float, httpx.Timeout], + logging_obj: LiteLLMLoggingObj, ) -> Tuple[dict, BaseModel]: """ Helper to: @@ -630,6 +635,7 @@ class OpenAIChatCompletion(BaseLLM): openai_client=openai_client, data=data, timeout=timeout, + logging_obj=logging_obj, ) ) @@ -762,7 +768,10 @@ class OpenAIChatCompletion(BaseLLM): ) headers, response = await self.make_openai_chat_completion_request( - openai_aclient=openai_aclient, data=data, timeout=timeout + openai_aclient=openai_aclient, + data=data, + timeout=timeout, + logging_obj=logging_obj, ) stringified_response = response.model_dump() @@ -852,6 +861,7 @@ class OpenAIChatCompletion(BaseLLM): openai_client=openai_client, data=data, timeout=timeout, + logging_obj=logging_obj, ) logging_obj.model_call_details["response_headers"] = headers @@ -910,7 +920,10 @@ class OpenAIChatCompletion(BaseLLM): ) headers, response = await self.make_openai_chat_completion_request( - openai_aclient=openai_aclient, data=data, timeout=timeout + openai_aclient=openai_aclient, + data=data, + timeout=timeout, + logging_obj=logging_obj, ) logging_obj.model_call_details["response_headers"] = headers streamwrapper = CustomStreamWrapper( @@ -965,11 +978,13 @@ class OpenAIChatCompletion(BaseLLM): ) # Embedding + @track_llm_api_timing() async def make_openai_embedding_request( self, openai_aclient: AsyncOpenAI, data: dict, timeout: Union[float, httpx.Timeout], + logging_obj: LiteLLMLoggingObj, ): """ Helper to: @@ -986,11 +1001,13 @@ class OpenAIChatCompletion(BaseLLM): except Exception as e: raise e + @track_llm_api_timing() def make_sync_openai_embedding_request( self, openai_client: OpenAI, data: dict, timeout: Union[float, 
httpx.Timeout], + logging_obj: LiteLLMLoggingObj, ): """ Helper to: @@ -1030,7 +1047,10 @@ class OpenAIChatCompletion(BaseLLM): client=client, ) headers, response = await self.make_openai_embedding_request( - openai_aclient=openai_aclient, data=data, timeout=timeout + openai_aclient=openai_aclient, + data=data, + timeout=timeout, + logging_obj=logging_obj, ) logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() @@ -1128,7 +1148,10 @@ class OpenAIChatCompletion(BaseLLM): ## embedding CALL headers: Optional[Dict] = None headers, sync_embedding_response = self.make_sync_openai_embedding_request( - openai_client=openai_client, data=data, timeout=timeout + openai_client=openai_client, + data=data, + timeout=timeout, + logging_obj=logging_obj, ) # type: ignore ## LOGGING diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index bf975ebdac..2beb1bd435 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -733,11 +733,13 @@ def get_custom_headers( version: Optional[str] = None, model_region: Optional[str] = None, response_cost: Optional[Union[float, str]] = None, + hidden_params: Optional[dict] = None, fastest_response_batch_completion: Optional[bool] = None, request_data: Optional[dict] = {}, **kwargs, ) -> dict: exclude_values = {"", None} + hidden_params = hidden_params or {} headers = { "x-litellm-call-id": call_id, "x-litellm-model-id": model_id, @@ -750,6 +752,10 @@ def get_custom_headers( "x-litellm-key-rpm-limit": str(user_api_key_dict.rpm_limit), "x-litellm-key-max-budget": str(user_api_key_dict.max_budget), "x-litellm-key-spend": str(user_api_key_dict.spend), + "x-litellm-response-duration-ms": str(hidden_params.get("_response_ms", None)), + "x-litellm-overhead-duration-ms": str( + hidden_params.get("litellm_overhead_time_ms", None) + ), "x-litellm-fastest_response_batch_completion": ( str(fastest_response_batch_completion) if fastest_response_batch_completion is not None @@ -3491,6 +3497,7 @@ async def chat_completion( # noqa: PLR0915 model_region=getattr(user_api_key_dict, "allowed_model_region", ""), fastest_response_batch_completion=fastest_response_batch_completion, request_data=data, + hidden_params=hidden_params, **additional_headers, ) selected_data_generator = select_data_generator( @@ -3526,6 +3533,7 @@ async def chat_completion( # noqa: PLR0915 model_region=getattr(user_api_key_dict, "allowed_model_region", ""), fastest_response_batch_completion=fastest_response_batch_completion, request_data=data, + hidden_params=hidden_params, **additional_headers, ) ) @@ -3719,6 +3727,7 @@ async def completion( # noqa: PLR0915 api_base=api_base, version=version, response_cost=response_cost, + hidden_params=hidden_params, request_data=data, ) selected_data_generator = select_data_generator( @@ -3747,6 +3756,7 @@ async def completion( # noqa: PLR0915 version=version, response_cost=response_cost, request_data=data, + hidden_params=hidden_params, ) ) await check_response_size_is_safe(response=response) @@ -3977,6 +3987,7 @@ async def embeddings( # noqa: PLR0915 model_region=getattr(user_api_key_dict, "allowed_model_region", ""), call_id=litellm_call_id, request_data=data, + hidden_params=hidden_params, **additional_headers, ) ) @@ -4103,6 +4114,7 @@ async def image_generation( model_region=getattr(user_api_key_dict, "allowed_model_region", ""), call_id=litellm_call_id, request_data=data, + hidden_params=hidden_params, ) ) @@ -4223,6 +4235,7 @@ async def audio_speech( 
fastest_response_batch_completion=None, call_id=litellm_call_id, request_data=data, + hidden_params=hidden_params, ) select_data_generator( @@ -4362,6 +4375,7 @@ async def audio_transcriptions( model_region=getattr(user_api_key_dict, "allowed_model_region", ""), call_id=litellm_call_id, request_data=data, + hidden_params=hidden_params, **additional_headers, ) ) @@ -4510,6 +4524,7 @@ async def get_assistants( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -4607,6 +4622,7 @@ async def create_assistant( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -4703,6 +4719,7 @@ async def delete_assistant( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -4799,6 +4816,7 @@ async def create_threads( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -4894,6 +4912,7 @@ async def get_thread( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -4992,6 +5011,7 @@ async def add_messages( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -5086,6 +5106,7 @@ async def get_messages( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -5194,6 +5215,7 @@ async def run_thread( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -5316,6 +5338,7 @@ async def moderations( version=version, model_region=getattr(user_api_key_dict, "allowed_model_region", ""), request_data=data, + hidden_params=hidden_params, ) ) @@ -5488,6 +5511,7 @@ async def anthropic_response( # noqa: PLR0915 version=version, response_cost=response_cost, request_data=data, + hidden_params=hidden_params, ) ) diff --git a/litellm/utils.py b/litellm/utils.py index c6ae545e6e..298c2652cb 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -93,6 +93,9 @@ from litellm.litellm_core_utils.llm_response_utils.get_formatted_prompt import ( from litellm.litellm_core_utils.llm_response_utils.get_headers import ( get_response_headers, ) +from litellm.litellm_core_utils.llm_response_utils.response_metadata import ( + ResponseMetadata, +) from litellm.litellm_core_utils.redact_messages import ( LiteLLMLoggingObject, redact_message_input_output_from_logging, @@ -929,6 +932,15 @@ def client(original_function): # noqa: PLR0915 chunks, messages=kwargs.get("messages", None) ) else: + # RETURN RESULT + update_response_metadata( + result=result, + logging_obj=logging_obj, + model=model, + kwargs=kwargs, + start_time=start_time, + end_time=end_time, + ) return result elif "acompletion" in kwargs and kwargs["acompletion"] is True: return result @@ -966,25 +978,14 @@ def client(original_function): # noqa: PLR0915 end_time, ) # RETURN RESULT - if hasattr(result, "_hidden_params"): - result._hidden_params["model_id"] = kwargs.get("model_info", {}).get( - "id", None - ) - result._hidden_params["api_base"] = get_api_base( - model=model or "", - optional_params=getattr(logging_obj, "optional_params", {}), - ) - result._hidden_params["response_cost"] 
= ( - logging_obj._response_cost_calculator(result=result) - ) - - result._hidden_params["additional_headers"] = process_response_headers( - result._hidden_params.get("additional_headers") or {} - ) # GUARANTEE OPENAI HEADERS IN RESPONSE - if result is not None: - result._response_ms = ( - end_time - start_time - ).total_seconds() * 1000 # return response latency in ms like openai + update_response_metadata( + result=result, + logging_obj=logging_obj, + model=model, + kwargs=kwargs, + start_time=start_time, + end_time=end_time, + ) return result except Exception as e: call_type = original_function.__name__ @@ -1116,39 +1117,17 @@ def client(original_function): # noqa: PLR0915 chunks, messages=kwargs.get("messages", None) ) else: + update_response_metadata( + result=result, + logging_obj=logging_obj, + model=model, + kwargs=kwargs, + start_time=start_time, + end_time=end_time, + ) return result elif call_type == CallTypes.arealtime.value: return result - - # ADD HIDDEN PARAMS - additional call metadata - if hasattr(result, "_hidden_params"): - result._hidden_params["litellm_call_id"] = getattr( - logging_obj, "litellm_call_id", None - ) - result._hidden_params["model_id"] = kwargs.get("model_info", {}).get( - "id", None - ) - result._hidden_params["api_base"] = get_api_base( - model=model or "", - optional_params=kwargs, - ) - result._hidden_params["response_cost"] = ( - logging_obj._response_cost_calculator(result=result) - ) - result._hidden_params["additional_headers"] = process_response_headers( - result._hidden_params.get("additional_headers") or {} - ) # GUARANTEE OPENAI HEADERS IN RESPONSE - if ( - isinstance(result, ModelResponse) - or isinstance(result, EmbeddingResponse) - or isinstance(result, TranscriptionResponse) - ): - setattr( - result, - "_response_ms", - (end_time - start_time).total_seconds() * 1000, - ) # return response latency in ms like openai - ### POST-CALL RULES ### post_call_processing( original_response=result, model=model, optional_params=kwargs @@ -1190,6 +1169,15 @@ def client(original_function): # noqa: PLR0915 end_time=end_time, ) + update_response_metadata( + result=result, + logging_obj=logging_obj, + model=model, + kwargs=kwargs, + start_time=start_time, + end_time=end_time, + ) + return result except Exception as e: traceback_exception = traceback.format_exc() @@ -1293,6 +1281,31 @@ def _is_async_request( return False +def update_response_metadata( + result: Any, + logging_obj: LiteLLMLoggingObject, + model: Optional[str], + kwargs: dict, + start_time: datetime.datetime, + end_time: datetime.datetime, +) -> None: + """ + Updates response metadata, adds the following: + - response._hidden_params + - response._hidden_params["litellm_overhead_time_ms"] + - response.response_time_ms + """ + if result is None: + return + + metadata = ResponseMetadata(result) + metadata.set_hidden_params(logging_obj=logging_obj, model=model, kwargs=kwargs) + metadata.set_timing_metrics( + start_time=start_time, end_time=end_time, logging_obj=logging_obj + ) + metadata.apply() + + def _select_tokenizer( model: str, custom_tokenizer: Optional[CustomHuggingfaceTokenizer] = None ): diff --git a/tests/secret_manager_tests/conftest.py b/tests/litellm_utils_tests/conftest.py similarity index 100% rename from tests/secret_manager_tests/conftest.py rename to tests/litellm_utils_tests/conftest.py diff --git a/tests/secret_manager_tests/test_aws_secret_manager.py b/tests/litellm_utils_tests/test_aws_secret_manager.py similarity index 100% rename from 
tests/secret_manager_tests/test_aws_secret_manager.py rename to tests/litellm_utils_tests/test_aws_secret_manager.py diff --git a/tests/secret_manager_tests/test_get_secret.py b/tests/litellm_utils_tests/test_get_secret.py similarity index 100% rename from tests/secret_manager_tests/test_get_secret.py rename to tests/litellm_utils_tests/test_get_secret.py diff --git a/tests/secret_manager_tests/test_hashicorp.py b/tests/litellm_utils_tests/test_hashicorp.py similarity index 100% rename from tests/secret_manager_tests/test_hashicorp.py rename to tests/litellm_utils_tests/test_hashicorp.py diff --git a/tests/litellm_utils_tests/test_litellm_overhead.py b/tests/litellm_utils_tests/test_litellm_overhead.py new file mode 100644 index 0000000000..8d0bdf313d --- /dev/null +++ b/tests/litellm_utils_tests/test_litellm_overhead.py @@ -0,0 +1,116 @@ +import json +import os +import sys +import time +from datetime import datetime +from unittest.mock import AsyncMock, patch, MagicMock +import pytest + +sys.path.insert( + 0, os.path.abspath("../..") +) # Adds the parent directory to the system path +import litellm + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model", + [ + "bedrock/mistral.mistral-7b-instruct-v0:2", + "openai/gpt-4o", + "openai/self_hosted", + "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0", + ], +) +async def test_litellm_overhead(model): + + litellm._turn_on_debug() + start_time = datetime.now() + if model == "openai/self_hosted": + response = await litellm.acompletion( + model=model, + messages=[{"role": "user", "content": "Hello, world!"}], + api_base="https://exampleopenaiendpoint-production.up.railway.app/", + ) + else: + response = await litellm.acompletion( + model=model, + messages=[{"role": "user", "content": "Hello, world!"}], + ) + end_time = datetime.now() + total_time_ms = (end_time - start_time).total_seconds() * 1000 + print(response) + print(response._hidden_params) + litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"] + # calculate percent of overhead caused by litellm + overhead_percent = litellm_overhead_ms * 100 / total_time_ms + print("##########################\n") + print("total_time_ms", total_time_ms) + print("response litellm_overhead_ms", litellm_overhead_ms) + print("litellm overhead_percent {}%".format(overhead_percent)) + print("##########################\n") + assert litellm_overhead_ms > 0 + assert litellm_overhead_ms < 1000 + + # latency overhead should be less than total request time + assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000 + + # latency overhead should be under 40% of total request time + assert overhead_percent < 40 + + pass + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model", + [ + "bedrock/mistral.mistral-7b-instruct-v0:2", + "openai/gpt-4o", + "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0", + "openai/self_hosted", + ], +) +async def test_litellm_overhead_stream(model): + + litellm._turn_on_debug() + start_time = datetime.now() + if model == "openai/self_hosted": + response = await litellm.acompletion( + model=model, + messages=[{"role": "user", "content": "Hello, world!"}], + api_base="https://exampleopenaiendpoint-production.up.railway.app/", + stream=True, + ) + else: + response = await litellm.acompletion( + model=model, + messages=[{"role": "user", "content": "Hello, world!"}], + stream=True, + ) + + async for chunk in response: + print() + + end_time = datetime.now() + total_time_ms = (end_time - start_time).total_seconds() * 1000 + print(response) + 
print(response._hidden_params) + litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"] + # calculate percent of overhead caused by litellm + overhead_percent = litellm_overhead_ms * 100 / total_time_ms + print("##########################\n") + print("total_time_ms", total_time_ms) + print("response litellm_overhead_ms", litellm_overhead_ms) + print("litellm overhead_percent {}%".format(overhead_percent)) + print("##########################\n") + assert litellm_overhead_ms > 0 + assert litellm_overhead_ms < 1000 + + # latency overhead should be less than total request time + assert litellm_overhead_ms < (end_time - start_time).total_seconds() * 1000 + + # latency overhead should be under 40% of total request time + assert overhead_percent < 40 + + pass diff --git a/tests/secret_manager_tests/test_secret_manager.py b/tests/litellm_utils_tests/test_secret_manager.py similarity index 100% rename from tests/secret_manager_tests/test_secret_manager.py rename to tests/litellm_utils_tests/test_secret_manager.py diff --git a/tests/secret_manager_tests/vertex_key.json b/tests/litellm_utils_tests/vertex_key.json similarity index 100% rename from tests/secret_manager_tests/vertex_key.json rename to tests/litellm_utils_tests/vertex_key.json diff --git a/tests/local_testing/test_amazing_s3_logs.py b/tests/logging_callback_tests/test_amazing_s3_logs.py similarity index 100% rename from tests/local_testing/test_amazing_s3_logs.py rename to tests/logging_callback_tests/test_amazing_s3_logs.py
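A minimal sketch of how the `track_llm_api_timing` decorator added in litellm/litellm_core_utils/logging_utils.py is consumed, assuming the patched package is importable. The `SimpleNamespace` object is only a stand-in for litellm's `Logging` instance (it just provides the `model_call_details` dict the decorator writes into), and `fake_llm_api_call` simulates the provider HTTP call that the real handlers now decorate:

import asyncio
from datetime import datetime
from types import SimpleNamespace

from litellm.litellm_core_utils.logging_utils import track_llm_api_timing


@track_llm_api_timing()
async def fake_llm_api_call(*, logging_obj=None):
    # Stand-in for the provider HTTP call made by the real handlers.
    await asyncio.sleep(0.05)
    return {"ok": True}


async def main():
    logging_obj = SimpleNamespace(model_call_details={})
    start = datetime.now()
    await fake_llm_api_call(logging_obj=logging_obj)
    total_ms = (datetime.now() - start).total_seconds() * 1000

    # The decorator records the raw API duration on the logging object ...
    api_ms = logging_obj.model_call_details["llm_api_duration_ms"]
    # ... and ResponseMetadata.set_timing_metrics later derives the overhead
    # as total response time minus the recorded API time.
    overhead_ms = round(total_ms - api_ms, 4)
    print(f"api={api_ms:.2f}ms total={total_ms:.2f}ms overhead={overhead_ms}ms")


asyncio.run(main())

The same `llm_api_duration_ms` value is what `ResponseMetadata.set_timing_metrics` subtracts from the total response time to populate `litellm_overhead_time_ms` in `_hidden_params`.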
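A second sketch showing the timing headers surfaced by the proxy's `get_custom_headers` change, assuming a LiteLLM proxy is running locally on the default port and that the URL and virtual key below are placeholders:

import httpx

resp = httpx.post(
    "http://localhost:4000/v1/chat/completions",  # placeholder proxy URL
    headers={"Authorization": "Bearer sk-1234"},  # placeholder virtual key
    json={"model": "gpt-4o", "messages": [{"role": "user", "content": "hi"}]},
    timeout=60,
)
# Headers added to get_custom_headers() in this change:
print(resp.headers.get("x-litellm-response-duration-ms"))
print(resp.headers.get("x-litellm-overhead-duration-ms"))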