diff --git a/docs/my-website/docs/enterprise.md b/docs/my-website/docs/enterprise.md index e3758266a..5bd09ec15 100644 --- a/docs/my-website/docs/enterprise.md +++ b/docs/my-website/docs/enterprise.md @@ -20,6 +20,8 @@ This covers: - **Spend Tracking** - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags) - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) + - **Advanced Metrics** + - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - **Guardrails, PII Masking, Content Moderation** - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation) - ✅ [Prompt Injection Detection (with LakeraAI API)](./proxy/enterprise#prompt-injection-detection---lakeraai) diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md index e061a917e..5dabba5ed 100644 --- a/docs/my-website/docs/proxy/enterprise.md +++ b/docs/my-website/docs/proxy/enterprise.md @@ -23,6 +23,8 @@ Features: - **Spend Tracking** - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags) - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend) +- **Advanced Metrics** + - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens) - **Guardrails, PII Masking, Content Moderation** - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation) - ✅ [Prompt Injection Detection (with LakeraAI API)](#prompt-injection-detection---lakeraai) diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index f9ed5db3d..83bf8ee95 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -1188,6 +1188,7 @@ litellm_settings: s3_region_name: us-west-2 # AWS Region Name for S3 s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. 
This is AWS Access Key ID for S3 s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets ``` diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 2c7481f4c..6790b25b0 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -1,3 +1,6 @@ +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # 📈 Prometheus metrics [BETA] LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll @@ -61,6 +64,56 @@ http://localhost:4000/metrics | `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)| +### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens +Set this in your config.yaml to track how close you are to hitting your TPM / RPM limits on each model group + +```yaml +litellm_settings: + success_callback: ["prometheus"] + failure_callback: ["prometheus"] + return_response_headers: true # ensures the LLM API calls track the response headers +``` + +| Metric Name | Description | |----------------------|--------------------------------------| | `litellm_remaining_requests` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment | | `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment | + +Example Metric + + + + +```shell +litellm_remaining_requests +{ + api_base="https://api.openai.com/v1", + api_provider="openai", + litellm_model_name="gpt-3.5-turbo", + model_group="gpt-3.5-turbo" +} +8998.0 +``` + + + + + +```shell +litellm_remaining_tokens +{ + api_base="https://api.openai.com/v1", + api_provider="openai", + litellm_model_name="gpt-3.5-turbo", + model_group="gpt-3.5-turbo" +} +999981.0 +``` + + + + + ## Monitor System Health To monitor the health of litellm adjacent services (redis / postgres), do: diff --git a/litellm/__init__.py b/litellm/__init__.py index 0fa822a98..a9e6b69ae 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -125,6 +125,9 @@ llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all" ################## ### PREVIEW FEATURES ### enable_preview_features: bool = False +return_response_headers: bool = ( + False # get response headers from LLM API providers - example x-ratelimit-remaining-requests, +) ################## logging: bool = True caching: bool = ( diff --git a/litellm/integrations/prometheus.py b/litellm/integrations/prometheus.py index 4f0ffa387..6cd746907 100644 --- a/litellm/integrations/prometheus.py +++ b/litellm/integrations/prometheus.py @@ -2,14 +2,20 @@ #### What this does #### # On success, log events to Prometheus -import dotenv, os -import requests # type: ignore +import datetime +import os +import subprocess +import sys import traceback -import datetime, subprocess, sys -import litellm, uuid -from litellm._logging import print_verbose, verbose_logger +import uuid from typing import Optional, Union +import dotenv +import requests # type: ignore + +import litellm +from litellm._logging import print_verbose, verbose_logger + class PrometheusLogger: # Class variables or attributes @@ -20,6 +26,8 @@ class PrometheusLogger: try: from prometheus_client import Counter, Gauge + from litellm.proxy.proxy_server import premium_user + 
self.litellm_llm_api_failed_requests_metric = Counter( name="litellm_llm_api_failed_requests_metric", documentation="Total number of failed LLM API calls via litellm", @@ -88,6 +96,31 @@ class PrometheusLogger: labelnames=["hashed_api_key", "api_key_alias"], ) + # Litellm-Enterprise Metrics + if premium_user is True: + # Remaining Rate Limit for model + self.litellm_remaining_requests_metric = Gauge( + "litellm_remaining_requests", + "remaining requests for model, returned from LLM API Provider", + labelnames=[ + "model_group", + "api_provider", + "api_base", + "litellm_model_name", + ], + ) + + self.litellm_remaining_tokens_metric = Gauge( + "litellm_remaining_tokens", + "remaining tokens for model, returned from LLM API Provider", + labelnames=[ + "model_group", + "api_provider", + "api_base", + "litellm_model_name", + ], + ) + except Exception as e: print_verbose(f"Got exception on init prometheus client {str(e)}") raise e @@ -104,6 +137,8 @@ class PrometheusLogger: ): try: # Define prometheus client + from litellm.proxy.proxy_server import premium_user + verbose_logger.debug( f"prometheus Logging - Enters logging function for model {kwargs}" ) @@ -199,6 +234,10 @@ class PrometheusLogger: user_api_key, user_api_key_alias ).set(_remaining_api_key_budget) + # set x-ratelimit headers + if premium_user is True: + self.set_remaining_tokens_requests_metric(kwargs) + ### FAILURE INCREMENT ### if "exception" in kwargs: self.litellm_llm_api_failed_requests_metric.labels( @@ -216,6 +255,58 @@ class PrometheusLogger: verbose_logger.debug(traceback.format_exc()) pass + def set_remaining_tokens_requests_metric(self, request_kwargs: dict): + try: + verbose_logger.debug("setting remaining tokens requests metric") + _response_headers = request_kwargs.get("response_headers") + _litellm_params = request_kwargs.get("litellm_params", {}) or {} + _metadata = _litellm_params.get("metadata", {}) + litellm_model_name = request_kwargs.get("model", None) + model_group = _metadata.get("model_group", None) + api_base = _metadata.get("api_base", None) + llm_provider = _litellm_params.get("custom_llm_provider", None) + + remaining_requests = None + remaining_tokens = None + # OpenAI / OpenAI Compatible headers + if ( + _response_headers + and "x-ratelimit-remaining-requests" in _response_headers + ): + remaining_requests = _response_headers["x-ratelimit-remaining-requests"] + if ( + _response_headers + and "x-ratelimit-remaining-tokens" in _response_headers + ): + remaining_tokens = _response_headers["x-ratelimit-remaining-tokens"] + verbose_logger.debug( + f"remaining requests: {remaining_requests}, remaining tokens: {remaining_tokens}" + ) + + if remaining_requests: + """ + "model_group", + "api_provider", + "api_base", + "litellm_model_name" + """ + self.litellm_remaining_requests_metric.labels( + model_group, llm_provider, api_base, litellm_model_name + ).set(remaining_requests) + + if remaining_tokens: + self.litellm_remaining_tokens_metric.labels( + model_group, llm_provider, api_base, litellm_model_name + ).set(remaining_tokens) + + except Exception as e: + verbose_logger.error( + "Prometheus Error: set_remaining_tokens_requests_metric. 
Exception occurred - {}".format( + str(e) + ) + ) + return + def safe_get_remaining_budget( max_budget: Optional[float], spend: Optional[float] diff --git a/litellm/integrations/s3.py b/litellm/integrations/s3.py index 0796d1048..6e8c4a4e4 100644 --- a/litellm/integrations/s3.py +++ b/litellm/integrations/s3.py @@ -1,10 +1,14 @@ #### What this does #### # On success + failure, log events to Supabase +import datetime import os +import subprocess +import sys import traceback -import datetime, subprocess, sys -import litellm, uuid +import uuid + +import litellm from litellm._logging import print_verbose, verbose_logger @@ -54,6 +58,7 @@ class S3Logger: "s3_aws_session_token" ) s3_config = litellm.s3_callback_params.get("s3_config") + s3_path = litellm.s3_callback_params.get("s3_path") # done reading litellm.s3_callback_params self.bucket_name = s3_bucket_name diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py index e127ecea6..000feed44 100644 --- a/litellm/llms/azure.py +++ b/litellm/llms/azure.py @@ -23,6 +23,7 @@ from typing_extensions import overload import litellm from litellm import OpenAIConfig from litellm.caching import DualCache +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.utils import ( Choices, CustomStreamWrapper, @@ -458,6 +459,36 @@ class AzureChatCompletion(BaseLLM): return azure_client + async def make_azure_openai_chat_completion_request( + self, + azure_client: AsyncAzureOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call chat.completions.create.with_raw_response when litellm.return_response_headers is True + - call chat.completions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + await azure_client.chat.completions.with_raw_response.create( + **data, timeout=timeout + ) + ) + + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await azure_client.chat.completions.create( + **data, timeout=timeout + ) + return None, response + except Exception as e: + raise e + def completion( self, model: str, @@ -470,7 +501,7 @@ class AzureChatCompletion(BaseLLM): azure_ad_token: str, print_verbose: Callable, timeout: Union[float, httpx.Timeout], - logging_obj, + logging_obj: LiteLLMLoggingObj, optional_params, litellm_params, logger_fn, @@ -649,9 +680,9 @@ class AzureChatCompletion(BaseLLM): data: dict, timeout: Any, model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, azure_ad_token: Optional[str] = None, client=None, # this is the AsyncAzureOpenAI - logging_obj=None, ): response = None try: @@ -701,9 +732,13 @@ class AzureChatCompletion(BaseLLM): "complete_input_dict": data, }, ) - response = await azure_client.chat.completions.create( - **data, timeout=timeout + + headers, response = await self.make_azure_openai_chat_completion_request( + azure_client=azure_client, + data=data, + timeout=timeout, ) + logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() logging_obj.post_call( @@ -812,7 +847,7 @@ class AzureChatCompletion(BaseLLM): async def async_streaming( self, - logging_obj, + logging_obj: LiteLLMLoggingObj, api_base: str, api_key: str, api_version: str, @@ -861,9 +896,14 @@ class AzureChatCompletion(BaseLLM): "complete_input_dict": data, }, ) - response = await azure_client.chat.completions.create( - **data, timeout=timeout + + headers, response = await self.make_azure_openai_chat_completion_request( + 
azure_client=azure_client, + data=data, + timeout=timeout, ) + logging_obj.model_call_details["response_headers"] = headers + # return response streamwrapper = CustomStreamWrapper( completion_stream=response, diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 32e63b957..990ef2fae 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -21,6 +21,7 @@ from pydantic import BaseModel from typing_extensions import overload, override import litellm +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.types.utils import ProviderField from litellm.utils import ( Choices, @@ -652,6 +653,36 @@ class OpenAIChatCompletion(BaseLLM): else: return client + async def make_openai_chat_completion_request( + self, + openai_aclient: AsyncOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call chat.completions.create.with_raw_response when litellm.return_response_headers is True + - call chat.completions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + await openai_aclient.chat.completions.with_raw_response.create( + **data, timeout=timeout + ) + ) + + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await openai_aclient.chat.completions.create( + **data, timeout=timeout + ) + return None, response + except Exception as e: + raise e + def completion( self, model_response: ModelResponse, @@ -836,13 +867,13 @@ class OpenAIChatCompletion(BaseLLM): self, data: dict, model_response: ModelResponse, + logging_obj: LiteLLMLoggingObj, timeout: Union[float, httpx.Timeout], api_key: Optional[str] = None, api_base: Optional[str] = None, organization: Optional[str] = None, client=None, max_retries=None, - logging_obj=None, headers=None, ): response = None @@ -869,8 +900,8 @@ class OpenAIChatCompletion(BaseLLM): }, ) - response = await openai_aclient.chat.completions.create( - **data, timeout=timeout + headers, response = await self.make_openai_chat_completion_request( + openai_aclient=openai_aclient, data=data, timeout=timeout ) stringified_response = response.model_dump() logging_obj.post_call( @@ -879,9 +910,11 @@ class OpenAIChatCompletion(BaseLLM): original_response=stringified_response, additional_args={"complete_input_dict": data}, ) + logging_obj.model_call_details["response_headers"] = headers return convert_to_model_response_object( response_object=stringified_response, model_response_object=model_response, + hidden_params={"headers": headers}, ) except Exception as e: raise e @@ -931,10 +964,10 @@ class OpenAIChatCompletion(BaseLLM): async def async_streaming( self, - logging_obj, timeout: Union[float, httpx.Timeout], data: dict, model: str, + logging_obj: LiteLLMLoggingObj, api_key: Optional[str] = None, api_base: Optional[str] = None, organization: Optional[str] = None, @@ -965,9 +998,10 @@ class OpenAIChatCompletion(BaseLLM): }, ) - response = await openai_aclient.chat.completions.create( - **data, timeout=timeout + headers, response = await self.make_openai_chat_completion_request( + openai_aclient=openai_aclient, data=data, timeout=timeout ) + logging_obj.model_call_details["response_headers"] = headers streamwrapper = CustomStreamWrapper( completion_stream=response, model=model, @@ -992,17 +1026,43 @@ class OpenAIChatCompletion(BaseLLM): else: raise OpenAIError(status_code=500, message=f"{str(e)}") + # Embedding + async def make_openai_embedding_request( + self, + openai_aclient: 
AsyncOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call embeddings.create.with_raw_response when litellm.return_response_headers is True + - call embeddings.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = await openai_aclient.embeddings.with_raw_response.create( + **data, timeout=timeout + ) # type: ignore + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await openai_aclient.embeddings.create(**data, timeout=timeout) # type: ignore + return None, response + except Exception as e: + raise e + async def aembedding( self, input: list, data: dict, model_response: litellm.utils.EmbeddingResponse, timeout: float, + logging_obj: LiteLLMLoggingObj, api_key: Optional[str] = None, api_base: Optional[str] = None, client: Optional[AsyncOpenAI] = None, max_retries=None, - logging_obj=None, ): response = None try: @@ -1014,7 +1074,10 @@ class OpenAIChatCompletion(BaseLLM): max_retries=max_retries, client=client, ) - response = await openai_aclient.embeddings.create(**data, timeout=timeout) # type: ignore + headers, response = await self.make_openai_embedding_request( + openai_aclient=openai_aclient, data=data, timeout=timeout + ) + logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() ## LOGGING logging_obj.post_call( @@ -1229,6 +1292,34 @@ class OpenAIChatCompletion(BaseLLM): else: raise OpenAIError(status_code=500, message=str(e)) + # Audio Transcriptions + async def make_openai_audio_transcriptions_request( + self, + openai_aclient: AsyncOpenAI, + data: dict, + timeout: Union[float, httpx.Timeout], + ): + """ + Helper to: + - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True + - call openai_aclient.audio.transcriptions.create by default + """ + try: + if litellm.return_response_headers is True: + raw_response = ( + await openai_aclient.audio.transcriptions.with_raw_response.create( + **data, timeout=timeout + ) + ) # type: ignore + headers = dict(raw_response.headers) + response = raw_response.parse() + return headers, response + else: + response = await openai_aclient.audio.transcriptions.create(**data, timeout=timeout) # type: ignore + return None, response + except Exception as e: + raise e + def audio_transcriptions( self, model: str, @@ -1286,11 +1377,11 @@ class OpenAIChatCompletion(BaseLLM): data: dict, model_response: TranscriptionResponse, timeout: float, + logging_obj: LiteLLMLoggingObj, api_key: Optional[str] = None, api_base: Optional[str] = None, client=None, max_retries=None, - logging_obj=None, ): try: openai_aclient = self._get_openai_client( @@ -1302,9 +1393,12 @@ class OpenAIChatCompletion(BaseLLM): client=client, ) - response = await openai_aclient.audio.transcriptions.create( - **data, timeout=timeout - ) # type: ignore + headers, response = await self.make_openai_audio_transcriptions_request( + openai_aclient=openai_aclient, + data=data, + timeout=timeout, + ) + logging_obj.model_call_details["response_headers"] = headers stringified_response = response.model_dump() ## LOGGING logging_obj.post_call( @@ -1497,9 +1591,9 @@ class OpenAITextCompletion(BaseLLM): model: str, messages: list, timeout: float, + logging_obj: LiteLLMLoggingObj, print_verbose: Optional[Callable] = None, api_base: Optional[str] = None, - logging_obj=None, acompletion: bool = False, optional_params=None, litellm_params=None, diff --git 
a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml index 88b778a6d..9f2324e51 100644 --- a/litellm/proxy/proxy_config.yaml +++ b/litellm/proxy/proxy_config.yaml @@ -36,6 +36,7 @@ general_settings: LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY" litellm_settings: + return_response_headers: true success_callback: ["prometheus"] callbacks: ["otel", "hide_secrets"] failure_callback: ["prometheus"] diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 5138e9b61..1c10ef461 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler from litellm.llms.prompt_templates.factory import anthropic_messages_pt -# litellm.num_retries = 3 +# litellm.num_retries=3 litellm.cache = None litellm.success_callback = [] user_message = "Write a short poem about the sky" diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index 3d8cb3c2a..fb390bb48 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -249,6 +249,25 @@ def test_completion_azure_exception(): # test_completion_azure_exception() +def test_azure_embedding_exceptions(): + try: + + response = litellm.embedding( + model="azure/azure-embedding-model", + input="hello", + messages="hello", + ) + pytest.fail(f"Bad request this should have failed but got {response}") + + except Exception as e: + print(vars(e)) + # CRUCIAL Test - Ensures our exceptions are readable and not overly complicated. some users have complained exceptions will randomly have another exception raised in our exception mapping + assert ( + e.message + == "litellm.APIError: AzureException APIError - Embeddings.create() got an unexpected keyword argument 'messages'" + ) + + async def asynctest_completion_azure_exception(): try: import openai diff --git a/litellm/utils.py b/litellm/utils.py index 103f854b6..f8e8566f8 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -5810,6 +5810,18 @@ def exception_type( _model_group = _metadata.get("model_group") _deployment = _metadata.get("deployment") extra_information = f"\nModel: {model}" + + exception_provider = "Unknown" + if ( + isinstance(custom_llm_provider, str) + and len(custom_llm_provider) > 0 + ): + exception_provider = ( + custom_llm_provider[0].upper() + + custom_llm_provider[1:] + + "Exception" + ) + if _api_base: extra_information += f"\nAPI Base: `{_api_base}`" if ( diff --git a/pyproject.toml b/pyproject.toml index 2519c167f..c698a18e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.41.2" +version = "1.41.3" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.41.2" +version = "1.41.3" version_files = [ "pyproject.toml:^version" ]
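
Reviewer note: a minimal sketch of how the new `return_response_headers` plumbing could be exercised outside the enterprise Prometheus gauges. Assumptions, not part of this PR: an `OPENAI_API_KEY` in the environment, an OpenAI-compatible backend that actually returns `x-ratelimit-remaining-*` headers, and the illustrative callback name `log_remaining_limits`. Only the async OpenAI / Azure completion paths are wired up by this change, so the sketch uses `acompletion`; whether and when the synchronous success callback fires for async calls can vary by litellm version.

```python
import asyncio

import litellm


def log_remaining_limits(kwargs, completion_response, start_time, end_time):
    # With this PR, the async OpenAI / Azure paths stash the raw response headers on
    # logging_obj.model_call_details, so they surface in callback kwargs under
    # "response_headers" - the same field PrometheusLogger.set_remaining_tokens_requests_metric reads.
    headers = kwargs.get("response_headers") or {}
    print("x-ratelimit-remaining-requests:", headers.get("x-ratelimit-remaining-requests"))
    print("x-ratelimit-remaining-tokens:", headers.get("x-ratelimit-remaining-tokens"))


litellm.return_response_headers = True  # preview flag added in this PR
litellm.success_callback = [log_remaining_limits]  # litellm custom callback hook


async def main():
    await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
    )
    await asyncio.sleep(2)  # give the success callback a moment to run


if __name__ == "__main__":
    asyncio.run(main())
```

The enterprise `litellm_remaining_requests` / `litellm_remaining_tokens` gauges are populated from the same two headers, so a sketch like this can double as a quick smoke test that `response_headers` is being captured before checking `/metrics`.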