diff --git a/docs/my-website/docs/enterprise.md b/docs/my-website/docs/enterprise.md index e3758266a..5bd09ec15 100644 --- a/docs/my-website/docs/enterprise.md +++ b/docs/my-website/docs/enterprise.md @@ -20,6 +20,8 @@ This covers: - **Spend Tracking** - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags) - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) + - **Advanced Metrics** + - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - **Guardrails, PII Masking, Content Moderation** - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation) - ✅ [Prompt Injection Detection (with LakeraAI API)](./proxy/enterprise#prompt-injection-detection---lakeraai) diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index e061a917e..5dabba5ed 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -23,6 +23,8 @@ Features: - **Spend Tracking** - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags) - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) +- **Advanced Metrics** + - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - **Guardrails, PII Masking, Content Moderation** - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation) - ✅ [Prompt Injection Detection (with LakeraAI API)](#prompt-injection-detection---lakeraai) diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index f9ed5db3d..83bf8ee95 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -1188,6 +1188,7 @@ litellm_settings: s3_region_name: us-west-2 # AWS Region Name for S3 s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. 
This is AWS Access Key ID for S3 s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets ``` diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 2c7481f4c..6790b25b0 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # 📈 Prometheus metrics [BETA] LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll @@ -61,6 +64,56 @@ http://localhost:4000/metrics | `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)| +### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens +Set this in your config.yaml to track how close you are to hitting your TPM / RPM limits on each model group + +```yaml +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] + return_response_headers: true # ensures the LLM API calls track the response headers +``` + +| Metric Name | Description | |----------------------|--------------------------------------| | `litellm_remaining_requests` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment | | `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment | + +Example Metric + + + + +```shell +litellm_remaining_requests +{ + api_base="https://api.openai.com/v1", + api_provider="openai", + litellm_model_name="gpt-3.5-turbo", + model_group="gpt-3.5-turbo" +} +8998.0 +``` + + + + + +```shell +litellm_remaining_tokens +{ + api_base="https://api.openai.com/v1", + api_provider="openai", + litellm_model_name="gpt-3.5-turbo", + model_group="gpt-3.5-turbo" +} +999981.0 +``` + + + + + ## Monitor System Health To monitor the health of litellm adjacent services (redis / postgres), do: diff --git a/litellm/__init__.py b/litellm/__init__.py index 0fa822a98..a9e6b69ae 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -125,6 +125,9 @@ llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all" ################## ### PREVIEW FEATURES ### enable_preview_features: bool = False +return_response_headers: bool = ( + False # get response headers from LLM API providers - example x-ratelimit-remaining-requests, +) ################## logging: bool = True caching: bool = ( diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 4f0ffa387..6cd746907 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -2,14 +2,20 @@ #### What this does #### # On success, log events to Prometheus -import dotenv, os -import requests # type: ignore +import datetime +import os +import subprocess +import sys import traceback -import datetime, subprocess, sys -import litellm, uuid -from litellm._logging import print_verbose, verbose_logger +import uuid from typing import Optional, Union +import dotenv +import requests # type: ignore + +import litellm +from litellm._logging import print_verbose, verbose_logger + class PrometheusLogger: # Class variables or attributes @@ -20,6 +26,8 @@ class PrometheusLogger: try: from prometheus_client import Counter, Gauge + from litellm.proxy.proxy_server import premium_user + 
self.litellm_llm_api_failed_requests_metric = Counter( name="litellm_llm_api_failed_requests_metric", documentation="Total number of failed LLM API calls via litellm", @@ -88,6 +96,31 @@ class PrometheusLogger: labelnames=["hashed_api_key", "api_key_alias"], ) + # Litellm-Enterprise Metrics + if premium_user is True: + # Remaining Rate Limit for model + self.litellm_remaining_requests_metric = Gauge( + "litellm_remaining_requests", + "remaining requests for model, returned from LLM API Provider", + labelnames=[ + "model_group", + "api_provider", + "api_base", + "litellm_model_name", + ], + ) + + self.litellm_remaining_tokens_metric = Gauge( + "litellm_remaining_tokens", + "remaining tokens for model, returned from LLM API Provider", + labelnames=[ + "model_group", + "api_provider", + "api_base", + "litellm_model_name", + ], + ) + except Exception as e: print_verbose(f"Got exception on init prometheus client {str(e)}") raise e @@ -104,6 +137,8 @@ class PrometheusLogger: ): try: # Define prometheus client + from litellm.proxy.proxy_server import premium_user + verbose_logger.debug( f"prometheus Logging - Enters logging function for model {kwargs}" ) @@ -199,6 +234,10 @@ class PrometheusLogger: user_api_key, user_api_key_alias ).set(_remaining_api_key_budget) + # set x-ratelimit headers + if premium_user is True: + self.set_remaining_tokens_requests_metric(kwargs) + ### FAILURE INCREMENT ### if "exception" in kwargs: self.litellm_llm_api_failed_requests_metric.labels( @@ -216,6 +255,58 @@ class PrometheusLogger: verbose_logger.debug(traceback.format_exc()) pass + def set_remaining_tokens_requests_metric(self, request_kwargs: dict): + try: + verbose_logger.debug("setting remaining tokens requests metric") + _response_headers = request_kwargs.get("response_headers") + _litellm_params = request_kwargs.get("litellm_params", {}) or {} + _metadata = _litellm_params.get("metadata", {}) + litellm_model_name = request_kwargs.get("model", None) + model_group = _metadata.get("model_group", None) + api_base = _metadata.get("api_base", None) + llm_provider = _litellm_params.get("custom_llm_provider", None) + + remaining_requests = None + remaining_tokens = None + # OpenAI / OpenAI Compatible headers + if ( + _response_headers + and "x-ratelimit-remaining-requests" in _response_headers + ): + remaining_requests = _response_headers["x-ratelimit-remaining-requests"] + if ( + _response_headers + and "x-ratelimit-remaining-tokens" in _response_headers + ): + remaining_tokens = _response_headers["x-ratelimit-remaining-tokens"] + verbose_logger.debug( + f"remaining requests: {remaining_requests}, remaining tokens: {remaining_tokens}" + ) + + if remaining_requests: + """ + "model_group", + "api_provider", + "api_base", + "litellm_model_name" + """ + self.litellm_remaining_requests_metric.labels( + model_group, llm_provider, api_base, litellm_model_name + ).set(remaining_requests) + + if remaining_tokens: + self.litellm_remaining_tokens_metric.labels( + model_group, llm_provider, api_base, litellm_model_name + ).set(remaining_tokens) + + except Exception as e: + verbose_logger.error( + "Prometheus Error: set_remaining_tokens_requests_metric. 
Exception occurred - {}".format( + str(e) + ) + ) + return + def safe_get_remaining_budget( max_budget: Optional[float], spend: Optional[float] diff --git a/litellm/integrations/s3.py b/litellm/integrations/s3.py index 0796d1048..6e8c4a4e4 100644 --- a/litellm/integrations/s3.py +++ b/litellm/integrations/s3.py @@ -1,10 +1,14 @@ #### What this does #### # On success + failure, log events to Supabase +import datetime import os +import subprocess +import sys import traceback -import datetime, subprocess, sys -import litellm, uuid +import uuid + +import litellm from litellm._logging import print_verbose, verbose_logger @@ -54,6 +58,7 @@ class S3Logger: "s3_aws_session_token" ) s3_config = litellm.s3_callback_params.get("s3_config") + s3_path = litellm.s3_callback_params.get("s3_path") # done reading litellm.s3_callback_params self.bucket_name = s3_bucket_name diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index e127ecea6..000feed44 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -23,6 +23,7 @@ from typing_extensions import overload import litellm from litellm import OpenAIConfig from litellm.caching import DualCache +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.utils import ( Choices, CustomStreamWrapper, @@ -458,6 +459,36 @@ class AzureChatCompletion(BaseLLM): return azure_client + async def make_azure_openai_chat_completion_request( + self, + azure_client: AsyncAzureOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call chat.completions.create.with_raw_response when litellm.return_response_headers is True + - call chat.completions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + await azure_client.chat.completions.with_raw_response.create( + **data, timeout=timeout + ) + ) + + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await azure_client.chat.completions.create( + **data, timeout=timeout + ) + return None, response + except Exception as e: + raise e + def completion( self, model: str, @@ -470,7 +501,7 @@ class AzureChatCompletion(BaseLLM): azure_ad_token: str, print_verbose: Callable, timeout: Union[float, httpx.Timeout], - logging_obj, + logging_obj: LiteLLMLoggingObj, optional_params, litellm_params, logger_fn, @@ -649,9 +680,9 @@ class AzureChatCompletion(BaseLLM): data: dict, timeout: Any, model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, azure_ad_token: Optional[str] = None, client=None, # this is the AsyncAzureOpenAI - logging_obj=None, ): response = None try: @@ -701,9 +732,13 @@ class AzureChatCompletion(BaseLLM): "complete_input_dict": data, }, ) - response = await azure_client.chat.completions.create( - **data, timeout=timeout + + headers, response = await self.make_azure_openai_chat_completion_request( + azure_client=azure_client, + data=data, + timeout=timeout, ) + logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() logging_obj.post_call( @@ -812,7 +847,7 @@ class AzureChatCompletion(BaseLLM): async def async_streaming( self, - logging_obj, + logging_obj: LiteLLMLoggingObj, api_base: str, api_key: str, api_version: str, @@ -861,9 +896,14 @@ class AzureChatCompletion(BaseLLM): "complete_input_dict": data, }, ) - response = await azure_client.chat.completions.create( - **data, timeout=timeout + + headers, response = await self.make_azure_openai_chat_completion_request( + 
azure_client=azure_client, + data=data, + timeout=timeout, ) + logging_obj.model_call_details["response_headers"] = headers + # return response streamwrapper = CustomStreamWrapper( completion_stream=response, diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 32e63b957..990ef2fae 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -21,6 +21,7 @@ from pydantic import BaseModel from typing_extensions import overload, override import litellm +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.types.utils import ProviderField from litellm.utils import ( Choices, @@ -652,6 +653,36 @@ class OpenAIChatCompletion(BaseLLM): else: return client + async def make_openai_chat_completion_request( + self, + openai_aclient: AsyncOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call chat.completions.create.with_raw_response when litellm.return_response_headers is True + - call chat.completions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + await openai_aclient.chat.completions.with_raw_response.create( + **data, timeout=timeout + ) + ) + + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await openai_aclient.chat.completions.create( + **data, timeout=timeout + ) + return None, response + except Exception as e: + raise e + def completion( self, model_response: ModelResponse, @@ -836,13 +867,13 @@ class OpenAIChatCompletion(BaseLLM): self, data: dict, model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, timeout: Union[float, httpx.Timeout], api_key: Optional[str] = None, api_base: Optional[str] = None, organization: Optional[str] = None, client=None, max_retries=None, - logging_obj=None, headers=None, ): response = None @@ -869,8 +900,8 @@ class OpenAIChatCompletion(BaseLLM): }, ) - response = await openai_aclient.chat.completions.create( - **data, timeout=timeout + headers, response = await self.make_openai_chat_completion_request( + openai_aclient=openai_aclient, data=data, timeout=timeout ) stringified_response = response.model_dump() logging_obj.post_call( @@ -879,9 +910,11 @@ class OpenAIChatCompletion(BaseLLM): original_response=stringified_response, additional_args={"complete_input_dict": data}, ) + logging_obj.model_call_details["response_headers"] = headers return convert_to_model_response_object( response_object=stringified_response, model_response_object=model_response, + hidden_params={"headers": headers}, ) except Exception as e: raise e @@ -931,10 +964,10 @@ class OpenAIChatCompletion(BaseLLM): async def async_streaming( self, - logging_obj, timeout: Union[float, httpx.Timeout], data: dict, model: str, + logging_obj: LiteLLMLoggingObj, api_key: Optional[str] = None, api_base: Optional[str] = None, organization: Optional[str] = None, @@ -965,9 +998,10 @@ class OpenAIChatCompletion(BaseLLM): }, ) - response = await openai_aclient.chat.completions.create( - **data, timeout=timeout + headers, response = await self.make_openai_chat_completion_request( + openai_aclient=openai_aclient, data=data, timeout=timeout ) + logging_obj.model_call_details["response_headers"] = headers streamwrapper = CustomStreamWrapper( completion_stream=response, model=model, @@ -992,17 +1026,43 @@ class OpenAIChatCompletion(BaseLLM): else: raise OpenAIError(status_code=500, message=f"{str(e)}") + # Embedding + async def make_openai_embedding_request( + self, + openai_aclient: 
AsyncOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call embeddings.create.with_raw_response when litellm.return_response_headers is True + - call embeddings.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = await openai_aclient.embeddings.with_raw_response.create( + **data, timeout=timeout + ) # type: ignore + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await openai_aclient.embeddings.create(**data, timeout=timeout) # type: ignore + return None, response + except Exception as e: + raise e + async def aembedding( self, input: list, data: dict, model_response: litellm.utils.EmbeddingResponse, timeout: float, + logging_obj: LiteLLMLoggingObj, api_key: Optional[str] = None, api_base: Optional[str] = None, client: Optional[AsyncOpenAI] = None, max_retries=None, - logging_obj=None, ): response = None try: @@ -1014,7 +1074,10 @@ class OpenAIChatCompletion(BaseLLM): max_retries=max_retries, client=client, ) - response = await openai_aclient.embeddings.create(**data, timeout=timeout) # type: ignore + headers, response = await self.make_openai_embedding_request( + openai_aclient=openai_aclient, data=data, timeout=timeout + ) + logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() ## LOGGING logging_obj.post_call( @@ -1229,6 +1292,34 @@ class OpenAIChatCompletion(BaseLLM): else: raise OpenAIError(status_code=500, message=str(e)) + # Audio Transcriptions + async def make_openai_audio_transcriptions_request( + self, + openai_aclient: AsyncOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True + - call openai_aclient.audio.transcriptions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + await openai_aclient.audio.transcriptions.with_raw_response.create( + **data, timeout=timeout + ) + ) # type: ignore + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await openai_aclient.audio.transcriptions.create(**data, timeout=timeout) # type: ignore + return None, response + except Exception as e: + raise e + def audio_transcriptions( self, model: str, @@ -1286,11 +1377,11 @@ class OpenAIChatCompletion(BaseLLM): data: dict, model_response: TranscriptionResponse, timeout: float, + logging_obj: LiteLLMLoggingObj, api_key: Optional[str] = None, api_base: Optional[str] = None, client=None, max_retries=None, - logging_obj=None, ): try: openai_aclient = self._get_openai_client( @@ -1302,9 +1393,12 @@ class OpenAIChatCompletion(BaseLLM): client=client, ) - response = await openai_aclient.audio.transcriptions.create( - **data, timeout=timeout - ) # type: ignore + headers, response = await self.make_openai_audio_transcriptions_request( + openai_aclient=openai_aclient, + data=data, + timeout=timeout, + ) + logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() ## LOGGING logging_obj.post_call( @@ -1497,9 +1591,9 @@ class OpenAITextCompletion(BaseLLM): model: str, messages: list, timeout: float, + logging_obj: LiteLLMLoggingObj, print_verbose: Optional[Callable] = None, api_base: Optional[str] = None, - logging_obj=None, acompletion: bool = False, optional_params=None, litellm_params=None, diff --git 
a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 88b778a6d..9f2324e51 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -36,6 +36,7 @@ general_settings: LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY" litellm_settings: + return_response_headers: true success_callback: ["prometheus"] callbacks: ["otel", "hide_secrets"] failure_callback: ["prometheus"] diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 5138e9b61..1c10ef461 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.prompt_templates.factory import anthropic_messages_pt -# litellm.num_retries = 3 +# litellm.num_retries=3 litellm.cache = None litellm.success_callback = [] user_message = "Write a short poem about the sky" diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 3d8cb3c2a..fb390bb48 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -249,6 +249,25 @@ def test_completion_azure_exception(): # test_completion_azure_exception() +def test_azure_embedding_exceptions(): + try: + + response = litellm.embedding( + model="azure/azure-embedding-model", + input="hello", + messages="hello", + ) + pytest.fail(f"Bad request this should have failed but got {response}") + + except Exception as e: + print(vars(e)) + # CRUCIAL Test - Ensures our exceptions are readable and not overly complicated. some users have complained exceptions will randomly have another exception raised in our exception mapping + assert ( + e.message + == "litellm.APIError: AzureException APIError - Embeddings.create() got an unexpected keyword argument 'messages'" + ) + + async def asynctest_completion_azure_exception(): try: import openai diff --git a/litellm/utils.py b/litellm/utils.py index 103f854b6..f8e8566f8 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -5810,6 +5810,18 @@ def exception_type( _model_group = _metadata.get("model_group") _deployment = _metadata.get("deployment") extra_information = f"\nModel: {model}" + + exception_provider = "Unknown" + if ( + isinstance(custom_llm_provider, str) + and len(custom_llm_provider) > 0 + ): + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + if _api_base: extra_information += f"\nAPI Base: `{_api_base}`" if ( diff --git a/pyproject.toml b/pyproject.toml index 2519c167f..c698a18e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.41.2" +version = "1.41.3" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.41.2" +version = "1.41.3" version_files = [ "pyproject.toml:^version" ]
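
Reviewer note: a minimal sketch of how the new `return_response_headers` plumbing could be exercised outside the enterprise Prometheus gauges. Assumptions, not part of this PR: an `OPENAI_API_KEY` in the environment, an OpenAI-compatible backend that actually returns `x-ratelimit-remaining-*` headers, and the illustrative callback name `log_remaining_limits`. Only the async OpenAI / Azure completion paths are wired up by this change, so the sketch uses `acompletion`; whether and when the synchronous success callback fires for async calls can vary by litellm version.

```python
import asyncio

import litellm


def log_remaining_limits(kwargs, completion_response, start_time, end_time):
    # With this PR, the async OpenAI / Azure paths stash the raw response headers on
    # logging_obj.model_call_details, so they surface in callback kwargs under
    # "response_headers" - the same field PrometheusLogger.set_remaining_tokens_requests_metric reads.
    headers = kwargs.get("response_headers") or {}
    print("x-ratelimit-remaining-requests:", headers.get("x-ratelimit-remaining-requests"))
    print("x-ratelimit-remaining-tokens:", headers.get("x-ratelimit-remaining-tokens"))


litellm.return_response_headers = True  # preview flag added in this PR
litellm.success_callback = [log_remaining_limits]  # litellm custom callback hook


async def main():
    await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
    )
    await asyncio.sleep(2)  # give the success callback a moment to run


if __name__ == "__main__":
    asyncio.run(main())
```

The enterprise `litellm_remaining_requests` / `litellm_remaining_tokens` gauges are populated from the same two headers, so a sketch like this can double as a quick smoke test that `response_headers` is being captured before checking `/metrics`.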