Merge branch 'main' into litellm_disable_cooldowns

commit 011e14eb08

14 changed files with 353 additions and 30 deletions

@@ -20,6 +20,8 @@ This covers:
- **Spend Tracking**
  - ✅ [Tracking Spend for Custom Tags](./proxy/enterprise#tracking-spend-for-custom-tags)
  - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](./proxy/cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
  - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](./proxy/prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation**
  - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](./proxy/enterprise#content-moderation)
  - ✅ [Prompt Injection Detection (with LakeraAI API)](./proxy/enterprise#prompt-injection-detection---lakeraai)

@@ -23,6 +23,8 @@ Features:
- **Spend Tracking**
  - ✅ [Tracking Spend for Custom Tags](#tracking-spend-for-custom-tags)
  - ✅ [API Endpoints to get Spend Reports per Team, API Key, Customer](cost_tracking.md#✨-enterprise-api-endpoints-to-get-spend)
- **Advanced Metrics**
  - ✅ [`x-ratelimit-remaining-requests`, `x-ratelimit-remaining-tokens` for LLM APIs on Prometheus](prometheus#✨-enterprise-llm-remaining-requests-and-remaining-tokens)
- **Guardrails, PII Masking, Content Moderation**
  - ✅ [Content Moderation with LLM Guard, LlamaGuard, Secret Detection, Google Text Moderations](#content-moderation)
  - ✅ [Prompt Injection Detection (with LakeraAI API)](#prompt-injection-detection---lakeraai)

@@ -1188,6 +1188,7 @@ litellm_settings:
    s3_region_name: us-west-2 # AWS Region Name for S3
    s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # use os.environ/<variable name> to pass environment variables. This is the AWS Access Key ID for S3
    s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3
    s3_path: my-test-path # [OPTIONAL] set path in bucket you want to write logs to
    s3_endpoint_url: https://s3.amazonaws.com # [OPTIONAL] S3 endpoint URL, if you want to use Backblaze/Cloudflare S3 buckets
```

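For orientation, a rough sketch of how settings like these typically map onto a boto3 S3 client. This is illustrative only — the bucket name, client wiring, and `put_object` call below are assumptions, not LiteLLM's exact implementation:

```python
# Illustrative only: shows roughly how the s3_* config keys above would
# translate into a boto3 client call; LiteLLM's actual S3 logger may differ.
import os

import boto3

s3_client = boto3.client(
    "s3",
    region_name="us-west-2",                                    # s3_region_name
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],          # s3_aws_access_key_id
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],  # s3_aws_secret_access_key
    endpoint_url="https://s3.amazonaws.com",                    # s3_endpoint_url (optional)
)

# s3_path would act as a key prefix when a log object is written.
s3_client.put_object(
    Bucket="my-test-bucket",  # placeholder bucket name
    Key="my-test-path/example-log.json",
    Body=b'{"event": "example"}',
)
```
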
@@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# 📈 Prometheus metrics [BETA]

LiteLLM exposes a `/metrics` endpoint for Prometheus to poll.

@@ -61,6 +64,56 @@ http://localhost:4000/metrics
| `litellm_remaining_api_key_budget_metric` | Remaining Budget for API Key (A key Created on LiteLLM)|

### ✨ (Enterprise) LLM Remaining Requests and Remaining Tokens

Set this in your config.yaml to track how close you are to hitting your TPM / RPM limits on each model group:

```yaml
litellm_settings:
  success_callback: ["prometheus"]
  failure_callback: ["prometheus"]
  return_response_headers: true # ensures the LLM API calls track the response headers
```

| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_remaining_requests_metric` | Track `x-ratelimit-remaining-requests` returned from LLM API Deployment |
| `litellm_remaining_tokens` | Track `x-ratelimit-remaining-tokens` returned from LLM API Deployment |

Example Metrics

<Tabs>

<TabItem value="Remaining Requests" label="Remaining Requests">

```shell
litellm_remaining_requests
{
  api_base="https://api.openai.com/v1",
  api_provider="openai",
  litellm_model_name="gpt-3.5-turbo",
  model_group="gpt-3.5-turbo"
}
8998.0
```

</TabItem>

<TabItem value="Remaining Tokens" label="Remaining Tokens">

```shell
litellm_remaining_tokens
{
  api_base="https://api.openai.com/v1",
  api_provider="openai",
  litellm_model_name="gpt-3.5-turbo",
  model_group="gpt-3.5-turbo"
}
999981.0
```

</TabItem>

</Tabs>

## Monitor System Health

To monitor the health of litellm adjacent services (redis / postgres), do:

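To sanity-check that the enterprise gauges are being exported, here is a minimal sketch that scrapes the endpoint and prints the remaining-requests samples. It assumes the proxy is running locally on port 4000, as in the docs above:

```python
# Minimal sketch: scrape the proxy's /metrics endpoint and print the
# litellm_remaining_requests samples. Assumes the proxy runs on localhost:4000.
import requests

metrics_text = requests.get("http://localhost:4000/metrics", timeout=10).text

for line in metrics_text.splitlines():
    # skip HELP/TYPE comment lines, keep only the gauge samples we care about
    if line.startswith("litellm_remaining_requests{"):
        print(line)
```
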
@@ -125,6 +125,9 @@ llm_guard_mode: Literal["all", "key-specific", "request-specific"] = "all"
##################
### PREVIEW FEATURES ###
enable_preview_features: bool = False
return_response_headers: bool = (
    False  # get response headers from LLM API providers - example: x-ratelimit-remaining-requests
)
##################
logging: bool = True
caching: bool = (

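As a usage sketch of the new preview flag (hedged — the `_hidden_params["headers"]` access is an assumption inferred from how this diff wires headers into the response object, not a documented contract):

```python
# Sketch only: enable the preview flag added above and read back the
# provider response headers from an async completion.
import asyncio

import litellm

litellm.return_response_headers = True


async def main():
    response = await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
    )
    # Assumption: headers are surfaced via the response's hidden params.
    headers = getattr(response, "_hidden_params", {}).get("headers", {})
    print(headers.get("x-ratelimit-remaining-requests"))
    print(headers.get("x-ratelimit-remaining-tokens"))


asyncio.run(main())
```
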
@@ -2,14 +2,20 @@
#### What this does ####
# On success, log events to Prometheus

import dotenv, os
import requests  # type: ignore
import datetime
import os
import subprocess
import sys
import traceback
import datetime, subprocess, sys
import litellm, uuid
from litellm._logging import print_verbose, verbose_logger
import uuid
from typing import Optional, Union

import dotenv
import requests  # type: ignore

import litellm
from litellm._logging import print_verbose, verbose_logger


class PrometheusLogger:
    # Class variables or attributes

@@ -20,6 +26,8 @@ class PrometheusLogger:
        try:
            from prometheus_client import Counter, Gauge

            from litellm.proxy.proxy_server import premium_user

            self.litellm_llm_api_failed_requests_metric = Counter(
                name="litellm_llm_api_failed_requests_metric",
                documentation="Total number of failed LLM API calls via litellm",

@@ -88,6 +96,31 @@ class PrometheusLogger:
                labelnames=["hashed_api_key", "api_key_alias"],
            )

            # Litellm-Enterprise Metrics
            if premium_user is True:
                # Remaining Rate Limit for model
                self.litellm_remaining_requests_metric = Gauge(
                    "litellm_remaining_requests",
                    "remaining requests for model, returned from LLM API Provider",
                    labelnames=[
                        "model_group",
                        "api_provider",
                        "api_base",
                        "litellm_model_name",
                    ],
                )

                self.litellm_remaining_tokens_metric = Gauge(
                    "litellm_remaining_tokens",
                    "remaining tokens for model, returned from LLM API Provider",
                    labelnames=[
                        "model_group",
                        "api_provider",
                        "api_base",
                        "litellm_model_name",
                    ],
                )

        except Exception as e:
            print_verbose(f"Got exception on init prometheus client {str(e)}")
            raise e

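For context, a small self-contained sketch of the labelled-gauge pattern these metrics follow (standalone name and values for illustration; the real labels come from each request's metadata):

```python
# Standalone illustration of the prometheus_client labelled-Gauge pattern used above.
from prometheus_client import Gauge

remaining_requests = Gauge(
    "litellm_remaining_requests_example",  # example name to avoid clashing with the real metric
    "remaining requests for model, returned from LLM API Provider",
    labelnames=["model_group", "api_provider", "api_base", "litellm_model_name"],
)

# Setting a sample: one time series per unique label combination.
remaining_requests.labels(
    "gpt-3.5-turbo", "openai", "https://api.openai.com/v1", "gpt-3.5-turbo"
).set(8998)
```
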
@@ -104,6 +137,8 @@ class PrometheusLogger:
    ):
        try:
            # Define prometheus client
            from litellm.proxy.proxy_server import premium_user

            verbose_logger.debug(
                f"prometheus Logging - Enters logging function for model {kwargs}"
            )

@@ -199,6 +234,10 @@ class PrometheusLogger:
                user_api_key, user_api_key_alias
            ).set(_remaining_api_key_budget)

            # set x-ratelimit headers
            if premium_user is True:
                self.set_remaining_tokens_requests_metric(kwargs)

            ### FAILURE INCREMENT ###
            if "exception" in kwargs:
                self.litellm_llm_api_failed_requests_metric.labels(

@@ -216,6 +255,58 @@ class PrometheusLogger:
            verbose_logger.debug(traceback.format_exc())
            pass

    def set_remaining_tokens_requests_metric(self, request_kwargs: dict):
        try:
            verbose_logger.debug("setting remaining tokens requests metric")
            _response_headers = request_kwargs.get("response_headers")
            _litellm_params = request_kwargs.get("litellm_params", {}) or {}
            _metadata = _litellm_params.get("metadata", {})
            litellm_model_name = request_kwargs.get("model", None)
            model_group = _metadata.get("model_group", None)
            api_base = _metadata.get("api_base", None)
            llm_provider = _litellm_params.get("custom_llm_provider", None)

            remaining_requests = None
            remaining_tokens = None
            # OpenAI / OpenAI Compatible headers
            if (
                _response_headers
                and "x-ratelimit-remaining-requests" in _response_headers
            ):
                remaining_requests = _response_headers["x-ratelimit-remaining-requests"]
            if (
                _response_headers
                and "x-ratelimit-remaining-tokens" in _response_headers
            ):
                remaining_tokens = _response_headers["x-ratelimit-remaining-tokens"]
            verbose_logger.debug(
                f"remaining requests: {remaining_requests}, remaining tokens: {remaining_tokens}"
            )

            if remaining_requests:
                """
                labels (in order):
                "model_group",
                "api_provider",
                "api_base",
                "litellm_model_name"
                """
                self.litellm_remaining_requests_metric.labels(
                    model_group, llm_provider, api_base, litellm_model_name
                ).set(remaining_requests)

            if remaining_tokens:
                self.litellm_remaining_tokens_metric.labels(
                    model_group, llm_provider, api_base, litellm_model_name
                ).set(remaining_tokens)

        except Exception as e:
            verbose_logger.error(
                "Prometheus Error: set_remaining_tokens_requests_metric. Exception occurred - {}".format(
                    str(e)
                )
            )
            return


def safe_get_remaining_budget(
    max_budget: Optional[float], spend: Optional[float]

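As a rough usage sketch: the `request_kwargs` shape below is inferred from the parsing logic above (not a documented schema), `prometheus_logger` stands in for an already-constructed `PrometheusLogger`, and a premium/enterprise setup is assumed so the gauges exist:

```python
# Sketch: what set_remaining_tokens_requests_metric expects, using fake values.
# Assumes the gauges were registered (premium_user=True) and that
# `prometheus_logger` is an existing PrometheusLogger instance.
fake_request_kwargs = {
    "model": "gpt-3.5-turbo",
    "response_headers": {
        "x-ratelimit-remaining-requests": 8998,
        "x-ratelimit-remaining-tokens": 999981,
    },
    "litellm_params": {
        "custom_llm_provider": "openai",
        "metadata": {
            "model_group": "gpt-3.5-turbo",
            "api_base": "https://api.openai.com/v1",
        },
    },
}

prometheus_logger.set_remaining_tokens_requests_metric(fake_request_kwargs)
```
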
@@ -1,10 +1,14 @@
#### What this does ####
# On success + failure, log events to Supabase

import datetime
import os
import subprocess
import sys
import traceback
import datetime, subprocess, sys
import litellm, uuid
import uuid

import litellm
from litellm._logging import print_verbose, verbose_logger


@@ -54,6 +58,7 @@ class S3Logger:
                    "s3_aws_session_token"
                )
                s3_config = litellm.s3_callback_params.get("s3_config")
                s3_path = litellm.s3_callback_params.get("s3_path")
                # done reading litellm.s3_callback_params

            self.bucket_name = s3_bucket_name

@@ -23,6 +23,7 @@ from typing_extensions import overload
import litellm
from litellm import OpenAIConfig
from litellm.caching import DualCache
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.utils import (
    Choices,
    CustomStreamWrapper,

@@ -458,6 +459,36 @@ class AzureChatCompletion(BaseLLM):

        return azure_client

    async def make_azure_openai_chat_completion_request(
        self,
        azure_client: AsyncAzureOpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
        - call chat.completions.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = (
                    await azure_client.chat.completions.with_raw_response.create(
                        **data, timeout=timeout
                    )
                )

                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = await azure_client.chat.completions.create(
                    **data, timeout=timeout
                )
                return None, response
        except Exception as e:
            raise e

    def completion(
        self,
        model: str,

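For reference, a standalone sketch of the `with_raw_response` pattern this helper (and its OpenAI counterpart below) is built on, using the OpenAI SDK directly; the endpoint, deployment name, key, and API version are placeholders:

```python
# Standalone sketch of the raw-response pattern: fetch both the parsed
# completion and the provider's HTTP headers in one call.
import asyncio

from openai import AsyncAzureOpenAI


async def main():
    client = AsyncAzureOpenAI(
        azure_endpoint="https://example.openai.azure.com",  # placeholder
        api_key="placeholder-key",                          # placeholder
        api_version="2024-02-01",                           # placeholder
    )
    raw = await client.chat.completions.with_raw_response.create(
        model="my-deployment",  # placeholder deployment name
        messages=[{"role": "user", "content": "hi"}],
    )
    headers = dict(raw.headers)   # e.g. x-ratelimit-remaining-requests
    completion = raw.parse()      # the usual ChatCompletion object
    print(headers.get("x-ratelimit-remaining-requests"))
    print(completion.choices[0].message.content)


asyncio.run(main())
```
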
@@ -470,7 +501,7 @@ class AzureChatCompletion(BaseLLM):
        azure_ad_token: str,
        print_verbose: Callable,
        timeout: Union[float, httpx.Timeout],
        logging_obj,
        logging_obj: LiteLLMLoggingObj,
        optional_params,
        litellm_params,
        logger_fn,

@@ -649,9 +680,9 @@ class AzureChatCompletion(BaseLLM):
        data: dict,
        timeout: Any,
        model_response: ModelResponse,
        logging_obj: LiteLLMLoggingObj,
        azure_ad_token: Optional[str] = None,
        client=None,  # this is the AsyncAzureOpenAI
        logging_obj=None,
    ):
        response = None
        try:

@@ -701,9 +732,13 @@ class AzureChatCompletion(BaseLLM):
                    "complete_input_dict": data,
                },
            )
            response = await azure_client.chat.completions.create(
                **data, timeout=timeout

            headers, response = await self.make_azure_openai_chat_completion_request(
                azure_client=azure_client,
                data=data,
                timeout=timeout,
            )
            logging_obj.model_call_details["response_headers"] = headers

            stringified_response = response.model_dump()
            logging_obj.post_call(

@@ -812,7 +847,7 @@ class AzureChatCompletion(BaseLLM):

    async def async_streaming(
        self,
        logging_obj,
        logging_obj: LiteLLMLoggingObj,
        api_base: str,
        api_key: str,
        api_version: str,

@@ -861,9 +896,14 @@ class AzureChatCompletion(BaseLLM):
                    "complete_input_dict": data,
                },
            )
            response = await azure_client.chat.completions.create(
                **data, timeout=timeout

            headers, response = await self.make_azure_openai_chat_completion_request(
                azure_client=azure_client,
                data=data,
                timeout=timeout,
            )
            logging_obj.model_call_details["response_headers"] = headers

            # return response
            streamwrapper = CustomStreamWrapper(
                completion_stream=response,

@@ -21,6 +21,7 @@ from pydantic import BaseModel
from typing_extensions import overload, override

import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.utils import ProviderField
from litellm.utils import (
    Choices,

@@ -652,6 +653,36 @@ class OpenAIChatCompletion(BaseLLM):
        else:
            return client

    async def make_openai_chat_completion_request(
        self,
        openai_aclient: AsyncOpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call chat.completions.create.with_raw_response when litellm.return_response_headers is True
        - call chat.completions.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = (
                    await openai_aclient.chat.completions.with_raw_response.create(
                        **data, timeout=timeout
                    )
                )

                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = await openai_aclient.chat.completions.create(
                    **data, timeout=timeout
                )
                return None, response
        except Exception as e:
            raise e

    def completion(
        self,
        model_response: ModelResponse,

@@ -836,13 +867,13 @@ class OpenAIChatCompletion(BaseLLM):
        self,
        data: dict,
        model_response: ModelResponse,
        logging_obj: LiteLLMLoggingObj,
        timeout: Union[float, httpx.Timeout],
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        organization: Optional[str] = None,
        client=None,
        max_retries=None,
        logging_obj=None,
        headers=None,
    ):
        response = None

@@ -869,8 +900,8 @@ class OpenAIChatCompletion(BaseLLM):
                },
            )

            response = await openai_aclient.chat.completions.create(
                **data, timeout=timeout
            headers, response = await self.make_openai_chat_completion_request(
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
            stringified_response = response.model_dump()
            logging_obj.post_call(

@@ -879,9 +910,11 @@ class OpenAIChatCompletion(BaseLLM):
                original_response=stringified_response,
                additional_args={"complete_input_dict": data},
            )
            logging_obj.model_call_details["response_headers"] = headers
            return convert_to_model_response_object(
                response_object=stringified_response,
                model_response_object=model_response,
                hidden_params={"headers": headers},
            )
        except Exception as e:
            raise e

@@ -931,10 +964,10 @@ class OpenAIChatCompletion(BaseLLM):

    async def async_streaming(
        self,
        logging_obj,
        timeout: Union[float, httpx.Timeout],
        data: dict,
        model: str,
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        organization: Optional[str] = None,

@@ -965,9 +998,10 @@ class OpenAIChatCompletion(BaseLLM):
                },
            )

            response = await openai_aclient.chat.completions.create(
                **data, timeout=timeout
            headers, response = await self.make_openai_chat_completion_request(
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
            logging_obj.model_call_details["response_headers"] = headers
            streamwrapper = CustomStreamWrapper(
                completion_stream=response,
                model=model,

@@ -992,17 +1026,43 @@ class OpenAIChatCompletion(BaseLLM):
            else:
                raise OpenAIError(status_code=500, message=f"{str(e)}")

    # Embedding
    async def make_openai_embedding_request(
        self,
        openai_aclient: AsyncOpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call embeddings.create.with_raw_response when litellm.return_response_headers is True
        - call embeddings.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = await openai_aclient.embeddings.with_raw_response.create(
                    **data, timeout=timeout
                )  # type: ignore
                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
                return None, response
        except Exception as e:
            raise e

    async def aembedding(
        self,
        input: list,
        data: dict,
        model_response: litellm.utils.EmbeddingResponse,
        timeout: float,
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        client: Optional[AsyncOpenAI] = None,
        max_retries=None,
        logging_obj=None,
    ):
        response = None
        try:

@@ -1014,7 +1074,10 @@ class OpenAIChatCompletion(BaseLLM):
                max_retries=max_retries,
                client=client,
            )
            response = await openai_aclient.embeddings.create(**data, timeout=timeout)  # type: ignore
            headers, response = await self.make_openai_embedding_request(
                openai_aclient=openai_aclient, data=data, timeout=timeout
            )
            logging_obj.model_call_details["response_headers"] = headers
            stringified_response = response.model_dump()
            ## LOGGING
            logging_obj.post_call(

@@ -1229,6 +1292,34 @@ class OpenAIChatCompletion(BaseLLM):
            else:
                raise OpenAIError(status_code=500, message=str(e))

    # Audio Transcriptions
    async def make_openai_audio_transcriptions_request(
        self,
        openai_aclient: AsyncOpenAI,
        data: dict,
        timeout: Union[float, httpx.Timeout],
    ):
        """
        Helper to:
        - call openai_aclient.audio.transcriptions.with_raw_response when litellm.return_response_headers is True
        - call openai_aclient.audio.transcriptions.create by default
        """
        try:
            if litellm.return_response_headers is True:
                raw_response = (
                    await openai_aclient.audio.transcriptions.with_raw_response.create(
                        **data, timeout=timeout
                    )
                )  # type: ignore
                headers = dict(raw_response.headers)
                response = raw_response.parse()
                return headers, response
            else:
                response = await openai_aclient.audio.transcriptions.create(**data, timeout=timeout)  # type: ignore
                return None, response
        except Exception as e:
            raise e

    def audio_transcriptions(
        self,
        model: str,

@@ -1286,11 +1377,11 @@ class OpenAIChatCompletion(BaseLLM):
        data: dict,
        model_response: TranscriptionResponse,
        timeout: float,
        logging_obj: LiteLLMLoggingObj,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        client=None,
        max_retries=None,
        logging_obj=None,
    ):
        try:
            openai_aclient = self._get_openai_client(

@@ -1302,9 +1393,12 @@ class OpenAIChatCompletion(BaseLLM):
                client=client,
            )

            response = await openai_aclient.audio.transcriptions.create(
                **data, timeout=timeout
            )  # type: ignore
            headers, response = await self.make_openai_audio_transcriptions_request(
                openai_aclient=openai_aclient,
                data=data,
                timeout=timeout,
            )
            logging_obj.model_call_details["response_headers"] = headers
            stringified_response = response.model_dump()
            ## LOGGING
            logging_obj.post_call(

@@ -1497,9 +1591,9 @@ class OpenAITextCompletion(BaseLLM):
        model: str,
        messages: list,
        timeout: float,
        logging_obj: LiteLLMLoggingObj,
        print_verbose: Optional[Callable] = None,
        api_base: Optional[str] = None,
        logging_obj=None,
        acompletion: bool = False,
        optional_params=None,
        litellm_params=None,

@@ -36,6 +36,7 @@ general_settings:
    LANGFUSE_SECRET_KEY: "os.environ/LANGFUSE_DEV_SK_KEY"

litellm_settings:
  return_response_headers: true
  success_callback: ["prometheus"]
  callbacks: ["otel", "hide_secrets"]
  failure_callback: ["prometheus"]

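The same settings can also be sketched programmatically for the SDK (a sketch for illustration; the YAML block above is the proxy's configured path):

```python
# Programmatic equivalent of the litellm_settings block above (sketch only).
import litellm

litellm.return_response_headers = True
litellm.success_callback = ["prometheus"]
litellm.failure_callback = ["prometheus"]
```
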
@@ -23,7 +23,7 @@ from litellm import RateLimitError, Timeout, completion, completion_cost, embedd
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.llms.prompt_templates.factory import anthropic_messages_pt

# litellm.num_retries = 3
# litellm.num_retries=3
litellm.cache = None
litellm.success_callback = []
user_message = "Write a short poem about the sky"

@@ -249,6 +249,25 @@ def test_completion_azure_exception():
# test_completion_azure_exception()


def test_azure_embedding_exceptions():
    try:

        response = litellm.embedding(
            model="azure/azure-embedding-model",
            input="hello",
            messages="hello",
        )
        pytest.fail(f"Bad request this should have failed but got {response}")

    except Exception as e:
        print(vars(e))
        # CRUCIAL Test - ensures our exceptions are readable and not overly complicated. Some users have complained that exceptions will randomly have another exception raised in our exception mapping.
        assert (
            e.message
            == "litellm.APIError: AzureException APIError - Embeddings.create() got an unexpected keyword argument 'messages'"
        )


async def asynctest_completion_azure_exception():
    try:
        import openai

@@ -5810,6 +5810,18 @@ def exception_type(
        _model_group = _metadata.get("model_group")
        _deployment = _metadata.get("deployment")
        extra_information = f"\nModel: {model}"

        exception_provider = "Unknown"
        if (
            isinstance(custom_llm_provider, str)
            and len(custom_llm_provider) > 0
        ):
            exception_provider = (
                custom_llm_provider[0].upper()
                + custom_llm_provider[1:]
                + "Exception"
            )

        if _api_base:
            extra_information += f"\nAPI Base: `{_api_base}`"
        if (

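A quick worked example of the provider-name formatting introduced above:

```python
# Illustration of the exception_provider formatting: "azure" -> "AzureException".
custom_llm_provider = "azure"
exception_provider = custom_llm_provider[0].upper() + custom_llm_provider[1:] + "Exception"
print(exception_provider)  # AzureException
```
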
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.2"
version = "1.41.3"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"

@@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.41.2"
version = "1.41.3"
version_files = [
    "pyproject.toml:^version"
]