LiteLLM Minor fixes + improvements (08/04/2024) (#5505)

* Minor IAM AWS OIDC Improvements (#5246)

* AWS IAM: Temporary tokens are valid across all regions after being issued, so it is wasteful to request one for each region.

* AWS IAM: Include an inline session policy to help limit misuse of overly permissive IAM roles.

* (test_bedrock_completion.py): Ensure we are testing the cross-region AWS OIDC flow.
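
A minimal sketch of the pattern these changes move toward, using plain boto3 rather than LiteLLM internals: request the web-identity credentials once, scope them with an inline session policy, and reuse them for any region. The role ARN, token path, and session name below are illustrative; the session policy mirrors the one added in this commit.

```python
import json

import boto3

# One STS call issues temporary credentials that are valid in every region,
# so there is no need to repeat the call per target region.
sts = boto3.client("sts", region_name="us-east-1")

# An inline session policy further restricts what the temporary credentials
# may do, even if the underlying IAM role is broader than necessary.
session_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "BedrockLiteLLM",
            "Effect": "Allow",
            "Action": [
                "bedrock:InvokeModel",
                "bedrock:InvokeModelWithResponseStream",
            ],
            "Resource": "*",
            "Condition": {
                "Bool": {"aws:SecureTransport": "true"},
                "StringLike": {"aws:UserAgent": "litellm/*"},
            },
        }
    ],
}

with open("/var/run/secrets/oidc/token") as f:  # illustrative token location
    oidc_token = f.read()

sts_response = sts.assume_role_with_web_identity(
    RoleArn="arn:aws:iam::123456789012:role/litellm-bedrock",  # illustrative ARN
    RoleSessionName="litellm-session",
    WebIdentityToken=oidc_token,
    DurationSeconds=3600,
    Policy=json.dumps(session_policy),
)
# sts_response["PackedPolicySize"] reports how much of the session-policy
# quota the inline policy consumes (the commit warns above 75%).
creds = sts_response["Credentials"]

# The same credentials back sessions for multiple regions.
for region in ("us-east-1", "us-west-2"):
    session = boto3.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
        region_name=region,
    )
    # e.g. hand this session to a Bedrock client for that region
    print(region, session.client("bedrock-runtime").meta.endpoint_url)
```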

* fix(router.py): log rejected requests

Fixes https://github.com/BerriAI/litellm/issues/5498
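
With this change, requests rejected inside the router's pre-call checks (for example rpm/tpm limits under usage-based-routing-v2) are handed to the failure handlers, so they reach failure callbacks instead of disappearing silently. A minimal sketch of wiring that up, assuming a custom callback registered via `litellm.callbacks`; the model name and rpm limit are illustrative.

```python
import asyncio

import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger


class RejectionLogger(CustomLogger):
    # Requests rejected by the router's pre-call checks now arrive here as
    # failure events (kwargs carries "exception" / "traceback_exception").
    async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
        print("request rejected:", kwargs.get("exception"))


litellm.callbacks = [RejectionLogger()]

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o-mini",
            "litellm_params": {"model": "openai/gpt-4o-mini", "rpm": 1},  # illustrative limit
        }
    ],
    routing_strategy="usage-based-routing-v2",
)


async def main():
    # Needs OPENAI_API_KEY for the calls that are not rejected.
    for _ in range(3):
        try:
            await router.acompletion(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": "hi"}],
            )
        except Exception as e:
            # The caller still sees the error; it is just no longer invisible
            # to the logging/alerting callbacks.
            print("caller sees:", type(e).__name__)


asyncio.run(main())
```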

* refactor: don't use verbose_logger.exception, if exception is raised

The user may already have handling for the exception, but production alerting systems will still flag the ERROR-level log entry as an unhandled error (see the sketch below).
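
The pattern, shown as a small before/after sketch; `do_work` is a hypothetical stand-in for any internal LiteLLM call.

```python
from litellm._logging import verbose_logger


def do_work() -> None:  # stand-in for any internal LiteLLM call
    raise ValueError("boom")


def before() -> None:
    try:
        do_work()
    except Exception as e:
        # Logged at exception level and then re-raised: callers that already
        # handle the error still see an "unhandled" entry in prod alerting.
        verbose_logger.exception(f"Exception occurred - {str(e)}")
        raise e


def after() -> None:
    try:
        do_work()
    except Exception as e:
        raise e  # don't use verbose_logger.exception, if exception is raised
```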

* fix(datadog.py): support setting datadog source as an env var

Fixes https://github.com/BerriAI/litellm/issues/5508

* docs(logging.md): add dd_source to datadog docs
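
A short sketch of the new behavior, assuming the `datadog-api-client` package is installed; only the `ddsource` lookup is the relevant part, the message and service values are placeholders.

```python
import os

from datadog_api_client.v2.models import HTTPLogItem

# DD_SOURCE now controls the `ddsource` tag on each log item, e.g. to
# separate "litellm_dev" from "litellm_prod" deployments; it falls back
# to "litellm" when unset.
item = HTTPLogItem(
    ddsource=os.getenv("DD_SOURCE", "litellm"),
    message='{"example": "payload"}',  # placeholder payload
    service="litellm-server",
)
print(item.ddsource)
```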

* fix(proxy_server.py): expose `/customer/list` endpoint for showing all customers
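
A hedged usage sketch for the new endpoint; the proxy URL and key are placeholders, and the endpoint requires a proxy-admin (or admin-viewer) key.

```python
import requests

# List all customers/end-users known to the proxy (admin-only endpoint).
resp = requests.get(
    "http://0.0.0.0:4000/customer/list",
    headers={"Authorization": "Bearer sk-1234"},  # placeholder admin key
)
resp.raise_for_status()
for customer in resp.json():
    # Each item is a LiteLLM_EndUserTable row; user_id is assumed to be
    # the identifying field.
    print(customer.get("user_id"))
```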

* (bedrock): Fix usage with Cloudflare AI Gateway, and proxies in general. (#5509)

* feat(anthropic.py): support 'cache_control' param for content when it is a string
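
A hedged example of the new shape: `cache_control` can now sit directly on a message whose `content` is a plain string, instead of only on list-style content blocks. The beta header is an assumption carried over from the existing prompt-caching tests.

```python
import litellm

response = litellm.completion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "system",
            "content": "Here is the full text of a complex legal agreement",
            "cache_control": {"type": "ephemeral"},  # string content, cached
        },
        {
            "role": "user",
            "content": "What are the key terms and conditions in this agreement?",
            "cache_control": {"type": "ephemeral"},
        },
    ],
    extra_headers={
        "anthropic-beta": "prompt-caching-2024-07-31"  # assumed to still be required
    },
)
print(response.usage)
```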

* Revert "(bedrock): Fix usage with Cloudflare AI Gateway, and proxies in gener…" (#5519)

This reverts commit 3fac0349c2.

* refactor: ci/cd run again

---------

Co-authored-by: David Manouchehri <david.manouchehri@ai.moda>
Krish Dholakia 2024-09-04 22:16:55 -07:00 committed by GitHub
parent cdc312d51d
commit 1e7e538261
24 changed files with 383 additions and 247 deletions


@@ -1,12 +1,12 @@
repos:
- repo: local
hooks:
- id: mypy
name: mypy
entry: python3 -m mypy --ignore-missing-imports
language: system
types: [python]
files: ^litellm/
# - id: mypy
# name: mypy
# entry: python3 -m mypy --ignore-missing-imports
# language: system
# types: [python]
# files: ^litellm/
- id: isort
name: isort
entry: isort


@@ -1426,6 +1426,7 @@ litellm_settings:
```shell
DD_API_KEY="5f2d0f310***********" # your datadog API Key
DD_SITE="us5.datadoghq.com" # your datadog base url
DD_SOURCE="litellm_dev" # [OPTIONAL] your datadog source. use to differentiate dev vs. prod deployments
```
**Step 3**: Start the proxy, make a test request


@@ -2039,10 +2039,7 @@ class DualCache(BaseCache):
return result
except Exception as e:
verbose_logger.exception(
f"LiteLLM Cache: Excepton async add_cache: {str(e)}"
)
raise e
raise e # don't log if exception is raised
async def async_set_cache_sadd(
self, key, value: List, local_only: bool = False, **kwargs
@@ -2069,10 +2066,7 @@ class DualCache(BaseCache):
return None
except Exception as e:
verbose_logger.exception(
"LiteLLM Cache: Excepton async set_cache_sadd: {}".format(str(e))
)
raise e
raise e # don't log, if exception is raised
def flush_cache(self):
if self.in_memory_cache is not None:
@@ -2543,7 +2537,6 @@ class Cache:
self.cache.set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
pass
async def async_add_cache(self, result, *args, **kwargs):
"""


@@ -235,10 +235,7 @@ class BraintrustLogger(CustomLogger):
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.exception(
"Error logging to braintrust - Exception received - {}".format(str(e))
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
@@ -360,10 +357,7 @@ class BraintrustLogger(CustomLogger):
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.exception(
"Error logging to braintrust - Exception received - {}".format(str(e))
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
return super().log_failure_event(kwargs, response_obj, start_time, end_time)


@@ -1,11 +1,17 @@
#### What this does ####
# On success + failure, log events to Datadog
import dotenv, os
import requests # type: ignore
import datetime
import os
import subprocess
import sys
import traceback
import datetime, subprocess, sys
import litellm, uuid
import uuid
import dotenv
import requests # type: ignore
import litellm
from litellm._logging import print_verbose, verbose_logger
@@ -57,9 +63,9 @@ class DataDogLogger:
):
try:
# Define DataDog client
from datadog_api_client.v2.api.logs_api import LogsApi
from datadog_api_client.v2 import ApiClient
from datadog_api_client.v2.models import HTTPLogItem, HTTPLog
from datadog_api_client.v2.api.logs_api import LogsApi
from datadog_api_client.v2.models import HTTPLog, HTTPLogItem
verbose_logger.debug(
f"datadog Logging - Enters logging function for model {kwargs}"
@@ -131,7 +137,7 @@ class DataDogLogger:
body = HTTPLog(
[
HTTPLogItem(
ddsource="litellm",
ddsource=os.getenv("DD_SOURCE", "litellm"),
message=payload,
service="litellm-server",
),


@@ -228,6 +228,54 @@ class AnthropicConfig:
return False
def translate_system_message(
self, messages: List[AllMessageValues]
) -> List[AnthropicSystemMessageContent]:
system_prompt_indices = []
anthropic_system_message_list: List[AnthropicSystemMessageContent] = []
for idx, message in enumerate(messages):
if message["role"] == "system":
valid_content: bool = False
system_message_block = ChatCompletionSystemMessage(**message)
if isinstance(system_message_block["content"], str):
anthropic_system_message_content = AnthropicSystemMessageContent(
type="text",
text=system_message_block["content"],
)
if "cache_control" in system_message_block:
anthropic_system_message_content["cache_control"] = (
system_message_block["cache_control"]
)
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
elif isinstance(message["content"], list):
for _content in message["content"]:
anthropic_system_message_content = (
AnthropicSystemMessageContent(
type=_content.get("type"),
text=_content.get("text"),
)
)
if "cache_control" in _content:
anthropic_system_message_content["cache_control"] = (
_content["cache_control"]
)
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
if valid_content:
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
return anthropic_system_message_list
### FOR [BETA] `/v1/messages` endpoint support
def translatable_anthropic_params(self) -> List:
@@ -314,7 +362,7 @@ class AnthropicConfig:
new_messages.append(user_message)
if len(new_user_content_list) > 0:
new_messages.append({"role": "user", "content": new_user_content_list})
new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore
if len(tool_message_list) > 0:
new_messages.extend(tool_message_list)
@@ -940,45 +988,11 @@ class AnthropicChatCompletion(BaseLLM):
)
else:
# Separate system prompt from rest of message
system_prompt_indices = []
system_prompt = ""
anthropic_system_message_list = None
for idx, message in enumerate(messages):
if message["role"] == "system":
valid_content: bool = False
if isinstance(message["content"], str):
system_prompt += message["content"]
valid_content = True
elif isinstance(message["content"], list):
for _content in message["content"]:
anthropic_system_message_content = (
AnthropicSystemMessageContent(
type=_content.get("type"),
text=_content.get("text"),
)
)
if "cache_control" in _content:
anthropic_system_message_content["cache_control"] = (
_content["cache_control"]
)
if anthropic_system_message_list is None:
anthropic_system_message_list = []
anthropic_system_message_list.append(
anthropic_system_message_content
)
valid_content = True
if valid_content:
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
anthropic_system_message_list = AnthropicConfig().translate_system_message(
messages=messages
)
# Handling anthropic API Prompt Caching
if anthropic_system_message_list is not None:
if len(anthropic_system_message_list) > 0:
optional_params["system"] = anthropic_system_message_list
# Format rest of message according to anthropic guidelines
try:
@@ -986,15 +1000,10 @@ class AnthropicChatCompletion(BaseLLM):
model=model, messages=messages, custom_llm_provider="anthropic"
)
except Exception as e:
verbose_logger.exception(
"litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format(
str(e), messages
)
)
raise AnthropicError(
status_code=400,
message="{}\nReceived Messages={}".format(str(e), messages),
)
) # don't use verbose_logger.exception, if exception is raised
## Load Config
config = litellm.AnthropicConfig.get_config()


@@ -119,8 +119,6 @@ class BaseAWSLLM(BaseLLM):
"aws_web_identity_token": aws_web_identity_token,
"aws_role_name": aws_role_name,
"aws_session_name": aws_session_name,
"aws_region_name": aws_region_name,
"aws_sts_endpoint": sts_endpoint,
}
)
@@ -147,6 +145,7 @@ class BaseAWSLLM(BaseLLM):
RoleSessionName=aws_session_name,
WebIdentityToken=oidc_token,
DurationSeconds=3600,
Policy='{"Version":"2012-10-17","Statement":[{"Sid":"BedrockLiteLLM","Effect":"Allow","Action":["bedrock:InvokeModel","bedrock:InvokeModelWithResponseStream"],"Resource":"*","Condition":{"Bool":{"aws:SecureTransport":"true"},"StringLike":{"aws:UserAgent":"litellm/*"}}}]}',
)
iam_creds_dict = {
@@ -164,6 +163,11 @@ class BaseAWSLLM(BaseLLM):
ttl=3600 - 60,
)
if sts_response["PackedPolicySize"] > 75:
verbose_logger.warning(
f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}"
)
session = boto3.Session(**iam_creds_dict)
iam_creds = session.get_credentials()


@@ -423,13 +423,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
async for transformed_chunk in streamwrapper:
yield transformed_chunk
except Exception as e:
verbose_logger.exception(
"LiteLLM.ollama.py::ollama_async_streaming(): Exception occured - {}".format(
str(e)
)
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
async def ollama_acompletion(
@@ -498,12 +492,7 @@ async def ollama_acompletion(
)
return model_response
except Exception as e:
verbose_logger.exception(
"LiteLLM.ollama.py::ollama_acompletion(): Exception occured - {}".format(
str(e)
)
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised
async def ollama_aembeddings(


@@ -583,8 +583,4 @@ async def ollama_acompletion(
)
return model_response
except Exception as e:
verbose_logger.exception(
"LiteLLM.ollama_acompletion(): Exception occured - {}".format(str(e))
)
raise e
raise e # don't use verbose_logger.exception, if exception is raised


@@ -168,9 +168,6 @@ def completion(
choices_list.append(choice_obj)
model_response.choices = choices_list # type: ignore
except Exception as e:
verbose_logger.exception(
"litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e))
)
raise PalmError(
message=traceback.format_exc(), status_code=response.status_code
)


@@ -564,12 +564,9 @@ class PredibaseChatCompletion(BaseLLM):
for exception in litellm.LITELLM_EXCEPTION_TYPES:
if isinstance(e, exception):
raise e
verbose_logger.exception(
"litellm.llms.predibase.py::async_completion() - Exception occurred - {}".format(
str(e)
)
)
raise PredibaseError(status_code=500, message="{}".format(str(e)))
raise PredibaseError(
status_code=500, message="{}".format(str(e))
) # don't use verbose_logger.exception, if exception is raised
return self.process_response(
model=model,
response=response,


@@ -27,10 +27,13 @@ from litellm.types.completion import (
from litellm.types.llms.anthropic import *
from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock
from litellm.types.llms.openai import (
AllMessageValues,
ChatCompletionAssistantMessage,
ChatCompletionAssistantToolCall,
ChatCompletionFunctionMessage,
ChatCompletionToolCallFunctionChunk,
ChatCompletionToolMessage,
ChatCompletionUserMessage,
)
from litellm.types.utils import GenericImageParsingChunk
@@ -493,10 +496,9 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] =
return rendered_text
except Exception as e:
verbose_logger.exception(
"Error rendering huggingface chat template - {}".format(str(e))
)
raise Exception(f"Error rendering template - {str(e)}")
raise Exception(
f"Error rendering template - {str(e)}"
) # don't use verbose_logger.exception, if exception is raised
# Anthropic template
@@ -1171,7 +1173,9 @@ def convert_to_gemini_tool_call_result(
return _part
def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResultParam:
def convert_to_anthropic_tool_result(
message: Union[dict, ChatCompletionToolMessage, ChatCompletionFunctionMessage]
) -> AnthropicMessagesToolResultParam:
"""
OpenAI message with a tool result looks like:
{
@@ -1215,7 +1219,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu
return anthropic_tool_result
if message["role"] == "function":
content = message.get("content") # type: ignore
tool_call_id = message.get("tool_call_id") or str(uuid.uuid4())
tool_call_id = message.get("tool_call_id") or str(uuid.uuid4()) # type: ignore
anthropic_tool_result = AnthropicMessagesToolResultParam(
type="tool_result", tool_use_id=tool_call_id, content=content
)
@@ -1230,7 +1234,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu
def convert_function_to_anthropic_tool_invoke(
function_call,
function_call: Union[dict, ChatCompletionToolCallFunctionChunk],
) -> List[AnthropicMessagesToolUseParam]:
try:
anthropic_tool_invoke = [
@@ -1247,7 +1251,7 @@ def convert_function_to_anthropic_tool_invoke(
def convert_to_anthropic_tool_invoke(
tool_calls: list,
tool_calls: List[ChatCompletionAssistantToolCall],
) -> List[AnthropicMessagesToolUseParam]:
"""
OpenAI tool invokes:
@@ -1307,17 +1311,19 @@ def add_cache_control_to_content(
anthropic_content_element: Union[
dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam
],
orignal_content_element: dict,
orignal_content_element: Union[dict, AllMessageValues],
):
if "cache_control" in orignal_content_element:
anthropic_content_element["cache_control"] = orignal_content_element[
"cache_control"
]
cache_control_param = orignal_content_element.get("cache_control")
if cache_control_param is not None and isinstance(cache_control_param, dict):
transformed_param = ChatCompletionCachedContent(**cache_control_param) # type: ignore
anthropic_content_element["cache_control"] = transformed_param
return anthropic_content_element
def anthropic_messages_pt(
messages: list,
messages: List[AllMessageValues],
model: str,
llm_provider: str,
) -> List[
@@ -1348,10 +1354,21 @@ def anthropic_messages_pt(
while msg_i < len(messages):
user_content: List[AnthropicMessagesUserMessageValues] = []
init_msg_i = msg_i
if isinstance(messages[msg_i], BaseModel):
messages[msg_i] = dict(messages[msg_i]) # type: ignore
## MERGE CONSECUTIVE USER CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
if isinstance(messages[msg_i]["content"], list):
for m in messages[msg_i]["content"]:
user_message_types_block: Union[
ChatCompletionToolMessage,
ChatCompletionUserMessage,
ChatCompletionFunctionMessage,
] = messages[
msg_i
] # type: ignore
if user_message_types_block["content"] and isinstance(
user_message_types_block["content"], list
):
for m in user_message_types_block["content"]:
if m.get("type", "") == "image_url":
image_chunk = convert_to_anthropic_image_obj(
m["image_url"]["url"]
@@ -1382,15 +1399,24 @@ def anthropic_messages_pt(
)
user_content.append(anthropic_content_element)
elif (
messages[msg_i]["role"] == "tool"
or messages[msg_i]["role"] == "function"
user_message_types_block["role"] == "tool"
or user_message_types_block["role"] == "function"
):
# OpenAI's tool message content will always be a string
user_content.append(convert_to_anthropic_tool_result(messages[msg_i]))
else:
user_content.append(
{"type": "text", "text": messages[msg_i]["content"]}
convert_to_anthropic_tool_result(user_message_types_block)
)
elif isinstance(user_message_types_block["content"], str):
_anthropic_content_text_element: AnthropicMessagesTextParam = {
"type": "text",
"text": user_message_types_block["content"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_content_text_element,
orignal_content_element=user_message_types_block,
)
user_content.append(anthropic_content_element)
msg_i += 1
@@ -1400,10 +1426,11 @@ def anthropic_messages_pt(
assistant_content: List[AnthropicMessagesAssistantMessageValues] = []
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
if "content" in messages[msg_i] and isinstance(
messages[msg_i]["content"], list
assistant_content_block: ChatCompletionAssistantMessage = messages[msg_i] # type: ignore
if "content" in assistant_content_block and isinstance(
assistant_content_block["content"], list
):
for m in messages[msg_i]["content"]:
for m in assistant_content_block["content"]:
# handle text
if (
m.get("type", "") == "text" and len(m.get("text", "")) > 0
@@ -1417,35 +1444,37 @@ def anthropic_messages_pt(
)
assistant_content.append(anthropic_message)
elif (
"content" in messages[msg_i]
and isinstance(messages[msg_i]["content"], str)
and len(messages[msg_i]["content"])
> 0 # don't pass empty text blocks. anthropic api raises errors.
"content" in assistant_content_block
and isinstance(assistant_content_block["content"], str)
and assistant_content_block[
"content"
] # don't pass empty text blocks. anthropic api raises errors.
):
_anthropic_text_content_element = {
"type": "text",
"text": messages[msg_i]["content"],
"text": assistant_content_block["content"],
}
anthropic_content_element = add_cache_control_to_content(
anthropic_content_element=_anthropic_text_content_element,
orignal_content_element=messages[msg_i],
orignal_content_element=assistant_content_block,
)
assistant_content.append(anthropic_content_element)
if messages[msg_i].get(
"tool_calls", []
assistant_tool_calls = assistant_content_block.get("tool_calls")
if (
assistant_tool_calls is not None
): # support assistant tool invoke conversion
assistant_content.extend(
convert_to_anthropic_tool_invoke(messages[msg_i]["tool_calls"])
convert_to_anthropic_tool_invoke(assistant_tool_calls)
)
if messages[msg_i].get("function_call"):
assistant_function_call = assistant_content_block.get("function_call")
if assistant_function_call is not None:
assistant_content.extend(
convert_function_to_anthropic_tool_invoke(
messages[msg_i]["function_call"]
)
convert_function_to_anthropic_tool_invoke(assistant_function_call)
)
msg_i += 1


@@ -491,14 +491,9 @@ class CodestralTextCompletion(BaseLLM):
message="HTTPStatusError - {}".format(e.response.text),
)
except Exception as e:
verbose_logger.exception(
"litellm.llms.text_completion_codestral.py::async_completion() - Exception occurred - {}".format(
str(e)
)
)
raise TextCompletionCodestralError(
status_code=500, message="{}".format(str(e))
)
) # don't use verbose_logger.exception, if exception is raised
return self.process_text_completion_response(
model=model,
response=response,


@@ -445,9 +445,6 @@ async def acompletion(
) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls)
return response
except Exception as e:
verbose_logger.exception(
"litellm.main.py::acompletion() - Exception occurred - {}".format(str(e))
)
custom_llm_provider = custom_llm_provider or "openai"
raise exception_type(
model=model,
@@ -616,9 +613,6 @@ def mock_completion(
except Exception as e:
if isinstance(e, openai.APIError):
raise e
verbose_logger.exception(
"litellm.mock_completion(): Exception occured - {}".format(str(e))
)
raise Exception("Mock completion response failed")
@@ -5125,9 +5119,6 @@ async def ahealth_check(
response = {} # args like remaining ratelimit etc.
return response
except Exception as e:
verbose_logger.exception(
"litellm.ahealth_check(): Exception occured - {}".format(str(e))
)
stack_trace = traceback.format_exc()
if isinstance(stack_trace, str):
stack_trace = stack_trace[:1000]


@@ -1,6 +1,16 @@
model_list:
- model_name: "whisper"
litellm_params:
model: "azure/azure-whisper"
api_key: os.environ/AZURE_EUROPE_API_KEY
api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/"
- model_name: gpt-4o-mini-2024-07-18
litellm_params:
api_key: API_KEY
model: openai/gpt-4o-mini-2024-07-18
rpm: 0
tpm: 100
router_settings:
num_retries: 0
routing_strategy: usage-based-routing-v2
timeout: 10
litellm_settings:
callbacks: custom_callbacks.proxy_handler_instance


@@ -386,7 +386,6 @@ async def user_api_key_auth(
parent_otel_span=parent_otel_span,
)
#### ELSE ####
## CHECK PASS-THROUGH ENDPOINTS ##
if pass_through_endpoints is not None:
for endpoint in pass_through_endpoints:


@@ -1,66 +1,10 @@
from litellm.integrations.custom_logger import CustomLogger
import litellm
# This file includes the custom callbacks for LiteLLM Proxy
# Once defined, these can be passed in proxy_config.yaml
class MyCustomHandler(CustomLogger):
def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call") # noqa
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(f"Post-API Call") # noqa
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream") # noqa
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print("On Success") # noqa
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure") # noqa
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"ishaan async_log_success_event") # noqa
# log: key, user, model, prompt, response, tokens, cost
# Access kwargs passed to litellm.completion()
model = kwargs.get("model", None)
messages = kwargs.get("messages", None)
user = kwargs.get("user", None)
# Access litellm_params passed to litellm.completion(), example access `metadata`
litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get(
"metadata", {}
) # headers passed to LiteLLM proxy, can be found here
return
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
try:
print(f"On Async Failure !") # noqa
print("\nkwargs", kwargs) # noqa
# Access kwargs passed to litellm.completion()
model = kwargs.get("model", None)
messages = kwargs.get("messages", None)
user = kwargs.get("user", None)
# Access litellm_params passed to litellm.completion(), example access `metadata`
litellm_params = kwargs.get("litellm_params", {})
metadata = litellm_params.get(
"metadata", {}
) # headers passed to LiteLLM proxy, can be found here
# Acess Exceptions & Traceback
exception_event = kwargs.get("exception", None)
traceback_event = kwargs.get("traceback_exception", None)
# Calculate cost using litellm.completion_cost()
except Exception as e:
print(f"Exception: {e}") # noqa
# print("Call failed")
pass
proxy_handler_instance = MyCustomHandler()
# Set litellm.callbacks = [proxy_handler_instance] on the proxy
# need to set litellm.callbacks = [proxy_handler_instance] # on the proxy


@@ -6183,6 +6183,64 @@ async def delete_end_user(
pass
@router.get(
"/customer/list",
tags=["Customer Management"],
dependencies=[Depends(user_api_key_auth)],
response_model=List[LiteLLM_EndUserTable],
)
@router.get(
"/end_user/list",
tags=["Customer Management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def list_team(
http_request: Request,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
[Admin-only] List all available customers
```
curl --location --request GET 'http://0.0.0.0:4000/customer/list' \
--header 'Authorization: Bearer sk-1234'
```
"""
from litellm.proxy.proxy_server import (
_duration_in_seconds,
create_audit_log_for_update,
litellm_proxy_admin_name,
prisma_client,
)
if (
user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN
and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY
):
raise HTTPException(
status_code=401,
detail={
"error": "Admin-only endpoint. Your user role={}".format(
user_api_key_dict.user_role
)
},
)
if prisma_client is None:
raise HTTPException(
status_code=400,
detail={"error": CommonProxyErrors.db_not_connected_error.value},
)
response = await prisma_client.db.litellm_endusertable.find_many()
returned_response: List[LiteLLM_EndUserTable] = []
for item in response:
returned_response.append(LiteLLM_EndUserTable(**item.model_dump()))
return returned_response
async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs):
if premium_user is not True:
return


@@ -47,6 +47,7 @@ from litellm._logging import verbose_router_logger
from litellm.assistants.main import AssistantDeleted
from litellm.caching import DualCache, InMemoryCache, RedisCache
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging
from litellm.llms.azure import get_azure_ad_token_from_oidc
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
@@ -783,6 +784,10 @@ class Router:
}
)
logging_obj: Optional[LiteLLMLogging] = kwargs.get(
"litellm_logging_obj", None
)
rpm_semaphore = self._get_client(
deployment=deployment,
kwargs=kwargs,
@@ -797,11 +802,13 @@ class Router:
- If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe)
"""
await self.async_routing_strategy_pre_call_checks(
deployment=deployment
deployment=deployment, logging_obj=logging_obj
)
response = await _response
else:
await self.async_routing_strategy_pre_call_checks(deployment=deployment)
await self.async_routing_strategy_pre_call_checks(
deployment=deployment, logging_obj=logging_obj
)
response = await _response
## CHECK CONTENT FILTER ERROR ##
@@ -3860,7 +3867,9 @@ class Router:
if isinstance(_callback, CustomLogger):
response = _callback.pre_call_check(deployment)
async def async_routing_strategy_pre_call_checks(self, deployment: dict):
async def async_routing_strategy_pre_call_checks(
self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None
):
"""
For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore.
@@ -3875,8 +3884,22 @@ class Router:
for _callback in litellm.callbacks:
if isinstance(_callback, CustomLogger):
try:
response = await _callback.async_pre_call_check(deployment)
_ = await _callback.async_pre_call_check(deployment)
except litellm.RateLimitError as e:
## LOG FAILURE EVENT
if logging_obj is not None:
asyncio.create_task(
logging_obj.async_failure_handler(
exception=e,
traceback_exception=traceback.format_exc(),
end_time=time.time(),
)
)
## LOGGING
threading.Thread(
target=logging_obj.failure_handler,
args=(e, traceback.format_exc()),
).start() # log response
self._set_cooldown_deployments(
exception_status=e.status_code,
original_exception=e,
@@ -3885,6 +3908,20 @@ class Router:
)
raise e
except Exception as e:
## LOG FAILURE EVENT
if logging_obj is not None:
asyncio.create_task(
logging_obj.async_failure_handler(
exception=e,
traceback_exception=traceback.format_exc(),
end_time=time.time(),
)
)
## LOGGING
threading.Thread(
target=logging_obj.failure_handler,
args=(e, traceback.format_exc()),
).start() # log response
raise e
def _generate_model_id(self, model_group: str, litellm_params: dict):


@@ -222,6 +222,94 @@ async def test_anthropic_api_prompt_caching_basic():
)
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_with_content_str():
from litellm.llms.prompt_templates.factory import anthropic_messages_pt
system_message = [
{
"role": "system",
"content": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
},
]
translated_system_message = litellm.AnthropicConfig().translate_system_message(
messages=system_message
)
assert translated_system_message == [
# System Message
{
"type": "text",
"text": "Here is the full text of a complex legal agreement",
"cache_control": {"type": "ephemeral"},
}
]
user_messages = [
# marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache.
{
"role": "user",
"content": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
{
"role": "assistant",
"content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
},
]
translated_messages = anthropic_messages_pt(
messages=user_messages,
model="claude-3-5-sonnet-20240620",
llm_provider="anthropic",
)
expected_messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
{
"role": "assistant",
"content": [
{
"type": "text",
"text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
}
],
},
# The final turn is marked with cache-control, for continuing in followups.
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are the key terms and conditions in this agreement?",
"cache_control": {"type": "ephemeral"},
}
],
},
]
assert len(translated_messages) == len(expected_messages)
for idx, i in enumerate(translated_messages):
assert (
i == expected_messages[idx]
), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx])
@pytest.mark.asyncio()
async def test_anthropic_api_prompt_caching_no_headers():
litellm.set_verbose = True


@@ -616,8 +616,8 @@ def test_completion_bedrock_httpx_command_r_sts_oidc_auth():
aws_region_name=aws_region_name,
aws_web_identity_token=aws_web_identity_token,
aws_role_name=aws_role_name,
aws_session_name="my-test-session",
aws_sts_endpoint="https://sts-fips.us-west-2.amazonaws.com",
aws_session_name="cross-region-test",
aws_sts_endpoint="https://sts-fips.us-east-2.amazonaws.com",
aws_bedrock_runtime_endpoint="https://bedrock-runtime-fips.us-west-2.amazonaws.com",
)
# Add any assertions here to check the response


@@ -3,6 +3,8 @@ from typing import Any, Dict, Iterable, List, Optional, Union
from pydantic import BaseModel, validator
from typing_extensions import Literal, Required, TypedDict
from .openai import ChatCompletionCachedContent
class AnthropicMessagesToolChoice(TypedDict, total=False):
type: Required[Literal["auto", "any", "tool"]]
@@ -18,7 +20,7 @@ class AnthropicMessagesTool(TypedDict, total=False):
class AnthropicMessagesTextParam(TypedDict, total=False):
type: Literal["text"]
text: str
cache_control: Optional[dict]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class AnthropicMessagesToolUseParam(TypedDict):
@@ -58,7 +60,7 @@ class AnthropicImageParamSource(TypedDict):
class AnthropicMessagesImageParam(TypedDict, total=False):
type: Literal["image"]
source: AnthropicImageParamSource
cache_control: Optional[dict]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class AnthropicMessagesToolResultContent(TypedDict):
@@ -97,7 +99,7 @@ class AnthropicMetadata(TypedDict, total=False):
class AnthropicSystemMessageContent(TypedDict, total=False):
type: str
text: str
cache_control: Optional[dict]
cache_control: Optional[Union[dict, ChatCompletionCachedContent]]
class AnthropicMessagesRequest(TypedDict, total=False):


@@ -354,14 +354,18 @@ class ChatCompletionImageObject(TypedDict):
image_url: ChatCompletionImageUrlObject
class ChatCompletionUserMessage(TypedDict):
class OpenAIChatCompletionUserMessage(TypedDict):
role: Literal["user"]
content: Union[
str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]]
]
class ChatCompletionAssistantMessage(TypedDict, total=False):
class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False):
cache_control: ChatCompletionCachedContent
class OpenAIChatCompletionAssistantMessage(TypedDict, total=False):
role: Required[Literal["assistant"]]
content: Optional[Union[str, Iterable[ChatCompletionTextObject]]]
name: Optional[str]
@@ -369,6 +373,10 @@ class ChatCompletionAssistantMessage(TypedDict, total=False):
function_call: Optional[ChatCompletionToolCallFunctionChunk]
class ChatCompletionAssistantMessage(OpenAIChatCompletionAssistantMessage, total=False):
cache_control: ChatCompletionCachedContent
class ChatCompletionToolMessage(TypedDict):
role: Literal["tool"]
content: str
@@ -381,12 +389,16 @@ class ChatCompletionFunctionMessage(TypedDict):
name: str
class ChatCompletionSystemMessage(TypedDict, total=False):
class OpenAIChatCompletionSystemMessage(TypedDict, total=False):
role: Required[Literal["system"]]
content: Required[Union[str, List]]
name: str
class ChatCompletionSystemMessage(OpenAIChatCompletionSystemMessage, total=False):
cache_control: ChatCompletionCachedContent
AllMessageValues = Union[
ChatCompletionUserMessage,
ChatCompletionAssistantMessage,


@@ -8547,11 +8547,6 @@ class CustomStreamWrapper:
"finish_reason": finish_reason,
}
except Exception as e:
verbose_logger.exception(
"litellm.CustomStreamWrapper.handle_predibase_chunk(): Exception occured - {}".format(
str(e)
)
)
raise e
def handle_huggingface_chunk(self, chunk):
@@ -8595,11 +8590,6 @@ class CustomStreamWrapper:
"finish_reason": finish_reason,
}
except Exception as e:
verbose_logger.exception(
"litellm.CustomStreamWrapper.handle_huggingface_chunk(): Exception occured - {}".format(
str(e)
)
)
raise e
def handle_ai21_chunk(self, chunk): # fake streaming
@@ -8826,11 +8816,6 @@ class CustomStreamWrapper:
"usage": usage,
}
except Exception as e:
verbose_logger.exception(
"litellm.CustomStreamWrapper.handle_openai_chat_completion_chunk(): Exception occured - {}".format(
str(e)
)
)
raise e
def handle_azure_text_completion_chunk(self, chunk):