Merge branch 'main' into litellm_disable_cooldowns

Krish Dholakia 2024-07-01 23:10:10 -07:00 committed by GitHub
commit 011e14eb08
14 changed files with 353 additions and 30 deletions


@ -20,6 +20,8 @@ This covers:
- **Spend Tracking**
- ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation**
- ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation)
- ✅ [Prompt Injection Detection (with LakeraAI API)](./proxy/enterprise#prompt-injection-detection---lakeraai)


@ -23,6 +23,8 @@ Features:
- **Spend Tracking**
- ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
- ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
- ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation**
- ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation)
- ✅ [Prompt Injection Detection (with LakeraAI API)](#prompt-injection-detection---lakeraai)


@ -1188,6 +1188,7 @@ litellm_settings:
s3_region_name: us-west-2 # AWS Region Name for S3
s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is AWS Access Key ID for S3
s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/cloudflare s3 buckets
```


@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 📈 Prometheus metrics [BETA]

LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll
@ -61,6 +64,56 @@ http://localhost:4000/metrics
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|
### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens
Set this on your config.yaml to allow you to track how close you are to hitting your TPM / RPM limits on each model group.

```yaml
litellm_settings:
  success_callback: ["prometheus"]
  failure_callback: ["prometheus"]
  return_response_headers: true # ensures the LLM API calls track the response headers
```

| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |
Example Metric
<Tabs>
<TabItem value="Remaining Requests" label="Remaining Requests">
```shell
litellm_remaining_requests
{
api_base="https://api.openai.com/v1",
api_provider="openai",
litellm_model_name="gpt-3.5-turbo",
model_group="gpt-3.5-turbo"
}
8998.0
```
</TabItem>
<TabItem value="Requests" label="Remaining Tokens">
```shell
litellm_remaining_tokens
{
api_base="https://api.openai.com/v1",
api_provider="openai",
litellm_model_name="gpt-3.5-turbo",
model_group="gpt-3.5-turbo"
}
999981.0
```
</TabItem>
</Tabs>
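To sanity-check these gauges without a full Prometheus server, you can scrape the `/metrics` endpoint directly. A minimal sketch, assuming the proxy is running on `localhost:4000` with the callbacks configured as above (the metric names come from the table above):

```python
# Minimal sketch: scrape the proxy's /metrics endpoint and print only the
# remaining-requests / remaining-tokens gauges (enterprise-only metrics).
import requests

resp = requests.get("http://localhost:4000/metrics", timeout=10)
resp.raise_for_status()

for line in resp.text.splitlines():
    if line.startswith(("litellm_remaining_requests", "litellm_remaining_tokens")):
        print(line)  # e.g. litellm_remaining_requests{...} 8998.0
```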
## Monitor System Health

To monitor the health of litellm adjacent services (redis / postgres), do:


@ -125,6 +125,9 @@ llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
##################
### PREVIEW FEATURES ###
enable_preview_features: bool = False
return_response_headers: bool = (
False # get response headers from LLM Api providers - example x-remaining-requests,
)
##################
logging: bool = True
caching: bool = (
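For context on what the new `return_response_headers` flag changes, here is a hypothetical SDK-side sketch (not part of this commit): it assumes an `OPENAI_API_KEY` in the environment, and the `_hidden_params["headers"]` access mirrors the OpenAI async path added later in this diff, so it may differ for other providers.

```python
# Hypothetical usage sketch for the new flag (illustrative only).
import asyncio
import litellm

litellm.return_response_headers = True  # opt in to raw response headers

async def main():
    response = await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
    )
    # The OpenAI async path in this commit surfaces headers via hidden params.
    headers = getattr(response, "_hidden_params", {}).get("headers", {}) or {}
    print(headers.get("x-ratelimit-remaining-requests"))
    print(headers.get("x-ratelimit-remaining-tokens"))

asyncio.run(main())
```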


@ -2,14 +2,20 @@
#### What this does ####
# On success, log events to Prometheus
-import dotenv, os
-import requests  # type: ignore
+import datetime
+import os
+import subprocess
+import sys
import traceback
-import datetime, subprocess, sys
-import litellm, uuid
-from litellm._logging import print_verbose, verbose_logger
+import uuid
from typing import Optional, Union
+import dotenv
+import requests  # type: ignore
+import litellm
+from litellm._logging import print_verbose, verbose_logger

class PrometheusLogger:
# Class variables or attributes
@ -20,6 +26,8 @@ class PrometheusLogger:
try:
from prometheus_client import Counter, Gauge
from litellm.proxy.proxy_server import premium_user
self.litellm_llm_api_failed_requests_metric = Counter(
name="litellm_llm_api_failed_requests_metric",
documentation="Total number of failed LLM API calls via litellm",
@ -88,6 +96,31 @@ class PrometheusLogger:
labelnames=["hashed_api_key", "api_key_alias"],
)
# Litellm-Enterprise Metrics
if premium_user is True:
    # Remaining Rate Limit for model
    self.litellm_remaining_requests_metric = Gauge(
        "litellm_remaining_requests",
        "remaining requests for model, returned from LLM API Provider",
        labelnames=[
            "model_group",
            "api_provider",
            "api_base",
            "litellm_model_name",
        ],
    )
    self.litellm_remaining_tokens_metric = Gauge(
        "litellm_remaining_tokens",
        "remaining tokens for model, returned from LLM API Provider",
        labelnames=[
            "model_group",
            "api_provider",
            "api_base",
            "litellm_model_name",
        ],
    )
except Exception as e:
print_verbose(f"Got exception on init prometheus client {str(e)}")
raise e
@ -104,6 +137,8 @@ class PrometheusLogger:
):
try:
# Define prometheus client
from litellm.proxy.proxy_server import premium_user
verbose_logger.debug(
f"prometheus Logging - Enters logging function for model {kwargs}"
)
@ -199,6 +234,10 @@ class PrometheusLogger:
user_api_key, user_api_key_alias
).set(_remaining_api_key_budget)
# set x-ratelimit headers
if premium_user is True:
    self.set_remaining_tokens_requests_metric(kwargs)
### FAILURE INCREMENT ###
if "exception" in kwargs:
self.litellm_llm_api_failed_requests_metric.labels(
@ -216,6 +255,58 @@ class PrometheusLogger:
verbose_logger.debug(traceback.format_exc())
pass
def set_remaining_tokens_requests_metric(self, request_kwargs: dict):
    try:
        verbose_logger.debug("setting remaining tokens requests metric")
        _response_headers = request_kwargs.get("response_headers")
        _litellm_params = request_kwargs.get("litellm_params", {}) or {}
        _metadata = _litellm_params.get("metadata", {})
        litellm_model_name = request_kwargs.get("model", None)
        model_group = _metadata.get("model_group", None)
        api_base = _metadata.get("api_base", None)
        llm_provider = _litellm_params.get("custom_llm_provider", None)

        remaining_requests = None
        remaining_tokens = None
        # OpenAI / OpenAI Compatible headers
        if (
            _response_headers
            and "x-ratelimit-remaining-requests" in _response_headers
        ):
            remaining_requests = _response_headers["x-ratelimit-remaining-requests"]
        if (
            _response_headers
            and "x-ratelimit-remaining-tokens" in _response_headers
        ):
            remaining_tokens = _response_headers["x-ratelimit-remaining-tokens"]
        verbose_logger.debug(
            f"remaining requests: {remaining_requests}, remaining tokens: {remaining_tokens}"
        )

        if remaining_requests:
            """
            "model_group",
            "api_provider",
            "api_base",
            "litellm_model_name"
            """
            self.litellm_remaining_requests_metric.labels(
                model_group, llm_provider, api_base, litellm_model_name
            ).set(remaining_requests)

        if remaining_tokens:
            self.litellm_remaining_tokens_metric.labels(
                model_group, llm_provider, api_base, litellm_model_name
            ).set(remaining_tokens)
    except Exception as e:
        verbose_logger.error(
            "Prometheus Error: set_remaining_tokens_requests_metric. Exception occured - {}".format(
                str(e)
            )
        )
        return
def safe_get_remaining_budget(
max_budget: Optional[float], spend: Optional[float]
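For reference, an illustrative sketch of the `kwargs` shape that `set_remaining_tokens_requests_metric` above inspects — the key names are taken from the code, the values are invented:

```python
# Illustrative kwargs shape for set_remaining_tokens_requests_metric (values invented).
example_kwargs = {
    "model": "gpt-3.5-turbo",                      # -> litellm_model_name label
    "response_headers": {
        "x-ratelimit-remaining-requests": "8998",   # -> litellm_remaining_requests gauge
        "x-ratelimit-remaining-tokens": "999981",   # -> litellm_remaining_tokens gauge
    },
    "litellm_params": {
        "custom_llm_provider": "openai",            # -> api_provider label
        "metadata": {
            "model_group": "gpt-3.5-turbo",         # -> model_group label
            "api_base": "https://api.openai.com/v1",  # -> api_base label
        },
    },
}
# PrometheusLogger().set_remaining_tokens_requests_metric(example_kwargs) would set
# both gauges for these labels (enterprise only, since the gauges are created
# when premium_user is True).
```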


@ -1,10 +1,14 @@
#### What this does ####
# On success + failure, log events to Supabase
+import datetime
import os
+import subprocess
+import sys
import traceback
-import datetime, subprocess, sys
-import litellm, uuid
+import uuid
+import litellm
from litellm._logging import print_verbose, verbose_logger
@ -54,6 +58,7 @@ class S3Logger:
"s3_aws_session_token" "s3_aws_session_token"
) )
s3_config = litellm.s3_callback_params.get("s3_config") s3_config = litellm.s3_callback_params.get("s3_config")
s3_path = litellm.s3_callback_params.get("s3_path")
# done reading litellm.s3_callback_params # done reading litellm.s3_callback_params
self.bucket_name = s3_bucket_name self.bucket_name = s3_bucket_name


@ -23,6 +23,7 @@ from typing_extensions import overload
import litellm
from litellm import OpenAIConfig
from litellm.caching import DualCache
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.utils import (
Choices,
CustomStreamWrapper,
@ -458,6 +459,36 @@ class AzureChatCompletion(BaseLLM):
return azure_client

async def make_azure_openai_chat_completion_request(
    self,
    azure_client: AsyncAzureOpenAI,
    data: dict,
    timeout: Union[float, httpx.Timeout],
):
    """
    Helper to:
    - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
    - call chat.completions.create by default
    """
    try:
        if litellm.return_response_headers is True:
            raw_response = (
                await azure_client.chat.completions.with_raw_response.create(
                    **data, timeout=timeout
                )
            )
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        else:
            response = await azure_client.chat.completions.create(
                **data, timeout=timeout
            )
            return None, response
    except Exception as e:
        raise e

def completion(
self,
model: str,
@ -470,7 +501,7 @@ class AzureChatCompletion(BaseLLM):
azure_ad_token: str,
print_verbose: Callable,
timeout: Union[float, httpx.Timeout],
-logging_obj,
+logging_obj: LiteLLMLoggingObj,
optional_params,
litellm_params,
logger_fn,
@ -649,9 +680,9 @@ class AzureChatCompletion(BaseLLM):
data: dict,
timeout: Any,
model_response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
azure_ad_token: Optional[str] = None,
client=None,  # this is the AsyncAzureOpenAI
-logging_obj=None,
):
response = None
try:
@ -701,9 +732,13 @@ class AzureChatCompletion(BaseLLM):
"complete_input_dict": data, "complete_input_dict": data,
}, },
) )
response = await azure_client.chat.completions.create(
**data, timeout=timeout headers, response = await self.make_azure_openai_chat_completion_request(
azure_client=azure_client,
data=data,
timeout=timeout,
) )
logging_obj.model_call_details["response_headers"] = headers
stringified_response = response.model_dump() stringified_response = response.model_dump()
logging_obj.post_call( logging_obj.post_call(
@ -812,7 +847,7 @@ class AzureChatCompletion(BaseLLM):
async def async_streaming(
self,
-logging_obj,
+logging_obj: LiteLLMLoggingObj,
api_base: str,
api_key: str,
api_version: str,
@ -861,9 +896,14 @@ class AzureChatCompletion(BaseLLM):
"complete_input_dict": data, "complete_input_dict": data,
}, },
) )
response = await azure_client.chat.completions.create(
**data, timeout=timeout headers, response = await self.make_azure_openai_chat_completion_request(
azure_client=azure_client,
data=data,
timeout=timeout,
) )
logging_obj.model_call_details["response_headers"] = headers
# return response # return response
streamwrapper = CustomStreamWrapper( streamwrapper = CustomStreamWrapper(
completion_stream=response, completion_stream=response,


@ -21,6 +21,7 @@ from pydantic import BaseModel
from typing_extensions import overload, override

import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.utils import ProviderField
from litellm.utils import (
Choices,
@ -652,6 +653,36 @@ class OpenAIChatCompletion(BaseLLM):
else:
return client

async def make_openai_chat_completion_request(
    self,
    openai_aclient: AsyncOpenAI,
    data: dict,
    timeout: Union[float, httpx.Timeout],
):
    """
    Helper to:
    - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
    - call chat.completions.create by default
    """
    try:
        if litellm.return_response_headers is True:
            raw_response = (
                await openai_aclient.chat.completions.with_raw_response.create(
                    **data, timeout=timeout
                )
            )
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        else:
            response = await openai_aclient.chat.completions.create(
                **data, timeout=timeout
            )
            return None, response
    except Exception as e:
        raise e

def completion(
self,
model_response: ModelResponse,
@ -836,13 +867,13 @@ class OpenAIChatCompletion(BaseLLM):
self,
data: dict,
model_response: ModelResponse,
logging_obj: LiteLLMLoggingObj,
timeout: Union[float, httpx.Timeout],
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
client=None,
max_retries=None,
-logging_obj=None,
headers=None,
):
response = None
@ -869,8 +900,8 @@ class OpenAIChatCompletion(BaseLLM):
},
)

-response = await openai_aclient.chat.completions.create(
-    **data, timeout=timeout
-)
+headers, response = await self.make_openai_chat_completion_request(
+    openai_aclient=openai_aclient, data=data, timeout=timeout
+)

stringified_response = response.model_dump()
logging_obj.post_call(
@ -879,9 +910,11 @@ class OpenAIChatCompletion(BaseLLM):
original_response=stringified_response,
additional_args={"complete_input_dict": data},
)
logging_obj.model_call_details["response_headers"] = headers
return convert_to_model_response_object(
response_object=stringified_response,
model_response_object=model_response,
hidden_params={"headers": headers},
)
except Exception as e:
raise e
@ -931,10 +964,10 @@ class OpenAIChatCompletion(BaseLLM):
async def async_streaming(
self,
-logging_obj,
timeout: Union[float, httpx.Timeout],
data: dict,
model: str,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
organization: Optional[str] = None,
@ -965,9 +998,10 @@ class OpenAIChatCompletion(BaseLLM):
},
)

-response = await openai_aclient.chat.completions.create(
-    **data, timeout=timeout
-)
+headers, response = await self.make_openai_chat_completion_request(
+    openai_aclient=openai_aclient, data=data, timeout=timeout
+)
+logging_obj.model_call_details["response_headers"] = headers

streamwrapper = CustomStreamWrapper(
completion_stream=response,
model=model,
@ -992,17 +1026,43 @@ class OpenAIChatCompletion(BaseLLM):
else:
raise OpenAIError(status_code=500, message=f"{str(e)}")

# Embedding
async def make_openai_embedding_request(
    self,
    openai_aclient: AsyncOpenAI,
    data: dict,
    timeout: Union[float, httpx.Timeout],
):
    """
    Helper to:
    - call embeddings.create.with_raw_response when litellm.return_response_headers is True
    - call embeddings.create by default
    """
    try:
        if litellm.return_response_headers is True:
            raw_response = await openai_aclient.embeddings.with_raw_response.create(
                **data, timeout=timeout
            )  # type: ignore
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        else:
            response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
            return None, response
    except Exception as e:
        raise e

async def aembedding(
self,
input: list,
data: dict,
model_response: litellm.utils.EmbeddingResponse,
timeout: float,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client: Optional[AsyncOpenAI] = None,
max_retries=None,
-logging_obj=None,
):
response = None
try:
@ -1014,7 +1074,10 @@ class OpenAIChatCompletion(BaseLLM):
max_retries=max_retries,
client=client,
)

-response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
+headers, response = await self.make_openai_embedding_request(
+    openai_aclient=openai_aclient, data=data, timeout=timeout
+)
+logging_obj.model_call_details["response_headers"] = headers

stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
@ -1229,6 +1292,34 @@ class OpenAIChatCompletion(BaseLLM):
else:
raise OpenAIError(status_code=500, message=str(e))

# Audio Transcriptions
async def make_openai_audio_transcriptions_request(
    self,
    openai_aclient: AsyncOpenAI,
    data: dict,
    timeout: Union[float, httpx.Timeout],
):
    """
    Helper to:
    - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
    - call openai_aclient.audio.transcriptions.create by default
    """
    try:
        if litellm.return_response_headers is True:
            raw_response = (
                await openai_aclient.audio.transcriptions.with_raw_response.create(
                    **data, timeout=timeout
                )
            )  # type: ignore
            headers = dict(raw_response.headers)
            response = raw_response.parse()
            return headers, response
        else:
            response = await openai_aclient.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
            return None, response
    except Exception as e:
        raise e

def audio_transcriptions(
self,
model: str,
@ -1286,11 +1377,11 @@ class OpenAIChatCompletion(BaseLLM):
data: dict,
model_response: TranscriptionResponse,
timeout: float,
logging_obj: LiteLLMLoggingObj,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
max_retries=None,
-logging_obj=None,
):
try:
openai_aclient = self._get_openai_client(
@ -1302,9 +1393,12 @@ class OpenAIChatCompletion(BaseLLM):
client=client,
)

-response = await openai_aclient.audio.transcriptions.create(
-    **data, timeout=timeout
-)  # type: ignore
+headers, response = await self.make_openai_audio_transcriptions_request(
+    openai_aclient=openai_aclient,
+    data=data,
+    timeout=timeout,
+)
+logging_obj.model_call_details["response_headers"] = headers

stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
@ -1497,9 +1591,9 @@ class OpenAITextCompletion(BaseLLM):
model: str,
messages: list,
timeout: float,
logging_obj: LiteLLMLoggingObj,
print_verbose: Optional[Callable] = None,
api_base: Optional[str] = None,
-logging_obj=None,
acompletion: bool = False,
optional_params=None,
litellm_params=None,
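All of the helpers above wrap the same `with_raw_response` pattern from the openai SDK. A standalone sketch of that pattern outside litellm, assuming `openai>=1.x` and an `OPENAI_API_KEY` in the environment:

```python
# Minimal sketch of the raw-response pattern wrapped by the helpers above.
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
    raw = await client.chat.completions.with_raw_response.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "ping"}],
    )
    headers = dict(raw.headers)  # x-ratelimit-* headers live here
    response = raw.parse()       # the usual ChatCompletion object
    print(headers.get("x-ratelimit-remaining-requests"), response.id)

asyncio.run(main())
```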


@ -36,6 +36,7 @@ general_settings:
  LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY"

litellm_settings:
  return_response_headers: true
  success_callback: ["prometheus"]
  callbacks: ["otel", "hide_secrets"]
  failure_callback: ["prometheus"]


@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt

-# litellm.num_retries = 3
+# litellm.num_retries=3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"


@ -249,6 +249,25 @@ def test_completion_azure_exception():
# test_completion_azure_exception()


def test_azure_embedding_exceptions():
    try:
        response = litellm.embedding(
            model="azure/azure-embedding-model",
            input="hello",
            messages="hello",
        )
        pytest.fail(f"Bad request this should have failed but got {response}")
    except Exception as e:
        print(vars(e))
        # CRUCIAL Test - Ensures our exceptions are readable and not overly complicated. some users have complained exceptions will randomly have another exception raised in our exception mapping
        assert (
            e.message
            == "litellm.APIError: AzureException APIError - Embeddings.create() got an unexpected keyword argument 'messages'"
        )


async def asynctest_completion_azure_exception():
try:
import openai


@ -5810,6 +5810,18 @@ def exception_type(
_model_group = _metadata.get("model_group")
_deployment = _metadata.get("deployment")
extra_information = f"\nModel: {model}"

exception_provider = "Unknown"
if (
    isinstance(custom_llm_provider, str)
    and len(custom_llm_provider) > 0
):
    exception_provider = (
        custom_llm_provider[0].upper()
        + custom_llm_provider[1:]
        + "Exception"
    )
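A worked example of the capitalization above (illustrative only):

```python
# e.g. "azure" -> "AzureException", "openai" -> "OpenaiException"
custom_llm_provider = "azure"
exception_provider = (
    custom_llm_provider[0].upper() + custom_llm_provider[1:] + "Exception"
)
assert exception_provider == "AzureException"
```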
if _api_base:
extra_information += f"\nAPI Base: `{_api_base}`"
if (


@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "1.41.2"
+version = "1.41.3"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
-version = "1.41.2"
+version = "1.41.3"
version_files = [
"pyproject.toml:^version"
]