diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a33473b72..d429bc6b8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: local hooks: - - id: mypy - name: mypy - entry: python3 -m mypy --ignore-missing-imports - language: system - types: [python] - files: ^litellm/ + # - id: mypy + # name: mypy + # entry: python3 -m mypy --ignore-missing-imports + # language: system + # types: [python] + # files: ^litellm/ - id: isort name: isort entry: isort diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index 82a7c37db..9492920d0 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -1426,6 +1426,7 @@ litellm_settings: ```shell DD_API_KEY="5f2d0f310***********" # your datadog API Key DD_SITE="us5.datadoghq.com" # your datadog base url +DD_SOURCE="litellm_dev" # [OPTIONAL] your datadog source. use to differentiate dev vs. prod deployments ``` **Step 3**: Start the proxy, make a test request diff --git a/litellm/caching.py b/litellm/caching.py index db2f93507..13da3cb1e 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -2039,10 +2039,7 @@ class DualCache(BaseCache): return result except Exception as e: - verbose_logger.exception( - f"LiteLLM Cache: Excepton async add_cache: {str(e)}" - ) - raise e + raise e # don't log if exception is raised async def async_set_cache_sadd( self, key, value: List, local_only: bool = False, **kwargs @@ -2069,10 +2066,7 @@ class DualCache(BaseCache): return None except Exception as e: - verbose_logger.exception( - "LiteLLM Cache: Excepton async set_cache_sadd: {}".format(str(e)) - ) - raise e + raise e # don't log, if exception is raised def flush_cache(self): if self.in_memory_cache is not None: @@ -2543,7 +2537,6 @@ class Cache: self.cache.set_cache(cache_key, cached_data, **kwargs) except Exception as e: verbose_logger.exception(f"LiteLLM Cache: Excepton add_cache: {str(e)}") - pass async def async_add_cache(self, result, *args, **kwargs): """ diff --git a/litellm/integrations/braintrust_logging.py b/litellm/integrations/braintrust_logging.py index 3e1c429de..5128fcc49 100644 --- a/litellm/integrations/braintrust_logging.py +++ b/litellm/integrations/braintrust_logging.py @@ -235,10 +235,7 @@ class BraintrustLogger(CustomLogger): except httpx.HTTPStatusError as e: raise Exception(e.response.text) except Exception as e: - verbose_logger.exception( - "Error logging to braintrust - Exception received - {}".format(str(e)) - ) - raise e + raise e # don't use verbose_logger.exception, if exception is raised async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): verbose_logger.debug("REACHES BRAINTRUST SUCCESS") @@ -360,10 +357,7 @@ class BraintrustLogger(CustomLogger): except httpx.HTTPStatusError as e: raise Exception(e.response.text) except Exception as e: - verbose_logger.exception( - "Error logging to braintrust - Exception received - {}".format(str(e)) - ) - raise e + raise e # don't use verbose_logger.exception, if exception is raised def log_failure_event(self, kwargs, response_obj, start_time, end_time): return super().log_failure_event(kwargs, response_obj, start_time, end_time) diff --git a/litellm/integrations/datadog.py b/litellm/integrations/datadog.py index f3170e446..e6ff29e04 100644 --- a/litellm/integrations/datadog.py +++ b/litellm/integrations/datadog.py @@ -1,11 +1,17 @@ #### What this does #### # On success + failure, log events to Datadog -import dotenv, os -import requests # type: ignore +import datetime +import os +import subprocess +import sys import traceback -import datetime, subprocess, sys -import litellm, uuid +import uuid + +import dotenv +import requests # type: ignore + +import litellm from litellm._logging import print_verbose, verbose_logger @@ -57,9 +63,9 @@ class DataDogLogger: ): try: # Define DataDog client - from datadog_api_client.v2.api.logs_api import LogsApi from datadog_api_client.v2 import ApiClient - from datadog_api_client.v2.models import HTTPLogItem, HTTPLog + from datadog_api_client.v2.api.logs_api import LogsApi + from datadog_api_client.v2.models import HTTPLog, HTTPLogItem verbose_logger.debug( f"datadog Logging - Enters logging function for model {kwargs}" @@ -131,7 +137,7 @@ class DataDogLogger: body = HTTPLog( [ HTTPLogItem( - ddsource="litellm", + ddsource=os.getenv("DD_SOURCE", "litellm"), message=payload, service="litellm-server", ), diff --git a/litellm/llms/anthropic/chat.py b/litellm/llms/anthropic/chat.py index c3ad03859..dd7ab58c1 100644 --- a/litellm/llms/anthropic/chat.py +++ b/litellm/llms/anthropic/chat.py @@ -228,6 +228,54 @@ class AnthropicConfig: return False + def translate_system_message( + self, messages: List[AllMessageValues] + ) -> List[AnthropicSystemMessageContent]: + system_prompt_indices = [] + anthropic_system_message_list: List[AnthropicSystemMessageContent] = [] + for idx, message in enumerate(messages): + if message["role"] == "system": + valid_content: bool = False + system_message_block = ChatCompletionSystemMessage(**message) + if isinstance(system_message_block["content"], str): + anthropic_system_message_content = AnthropicSystemMessageContent( + type="text", + text=system_message_block["content"], + ) + if "cache_control" in system_message_block: + anthropic_system_message_content["cache_control"] = ( + system_message_block["cache_control"] + ) + anthropic_system_message_list.append( + anthropic_system_message_content + ) + valid_content = True + elif isinstance(message["content"], list): + for _content in message["content"]: + anthropic_system_message_content = ( + AnthropicSystemMessageContent( + type=_content.get("type"), + text=_content.get("text"), + ) + ) + if "cache_control" in _content: + anthropic_system_message_content["cache_control"] = ( + _content["cache_control"] + ) + + anthropic_system_message_list.append( + anthropic_system_message_content + ) + valid_content = True + + if valid_content: + system_prompt_indices.append(idx) + if len(system_prompt_indices) > 0: + for idx in reversed(system_prompt_indices): + messages.pop(idx) + + return anthropic_system_message_list + ### FOR [BETA] `/v1/messages` endpoint support def translatable_anthropic_params(self) -> List: @@ -314,7 +362,7 @@ class AnthropicConfig: new_messages.append(user_message) if len(new_user_content_list) > 0: - new_messages.append({"role": "user", "content": new_user_content_list}) + new_messages.append({"role": "user", "content": new_user_content_list}) # type: ignore if len(tool_message_list) > 0: new_messages.extend(tool_message_list) @@ -940,45 +988,11 @@ class AnthropicChatCompletion(BaseLLM): ) else: # Separate system prompt from rest of message - system_prompt_indices = [] - system_prompt = "" - anthropic_system_message_list = None - for idx, message in enumerate(messages): - if message["role"] == "system": - valid_content: bool = False - if isinstance(message["content"], str): - system_prompt += message["content"] - valid_content = True - elif isinstance(message["content"], list): - for _content in message["content"]: - anthropic_system_message_content = ( - AnthropicSystemMessageContent( - type=_content.get("type"), - text=_content.get("text"), - ) - ) - if "cache_control" in _content: - anthropic_system_message_content["cache_control"] = ( - _content["cache_control"] - ) - - if anthropic_system_message_list is None: - anthropic_system_message_list = [] - anthropic_system_message_list.append( - anthropic_system_message_content - ) - valid_content = True - - if valid_content: - system_prompt_indices.append(idx) - if len(system_prompt_indices) > 0: - for idx in reversed(system_prompt_indices): - messages.pop(idx) - if len(system_prompt) > 0: - optional_params["system"] = system_prompt - + anthropic_system_message_list = AnthropicConfig().translate_system_message( + messages=messages + ) # Handling anthropic API Prompt Caching - if anthropic_system_message_list is not None: + if len(anthropic_system_message_list) > 0: optional_params["system"] = anthropic_system_message_list # Format rest of message according to anthropic guidelines try: @@ -986,15 +1000,10 @@ class AnthropicChatCompletion(BaseLLM): model=model, messages=messages, custom_llm_provider="anthropic" ) except Exception as e: - verbose_logger.exception( - "litellm.llms.anthropic.chat.py::completion() - Exception occurred - {}\nReceived Messages: {}".format( - str(e), messages - ) - ) raise AnthropicError( status_code=400, message="{}\nReceived Messages={}".format(str(e), messages), - ) + ) # don't use verbose_logger.exception, if exception is raised ## Load Config config = litellm.AnthropicConfig.get_config() diff --git a/litellm/llms/base_aws_llm.py b/litellm/llms/base_aws_llm.py index 7449dc2d7..70f333eb6 100644 --- a/litellm/llms/base_aws_llm.py +++ b/litellm/llms/base_aws_llm.py @@ -119,8 +119,6 @@ class BaseAWSLLM(BaseLLM): "aws_web_identity_token": aws_web_identity_token, "aws_role_name": aws_role_name, "aws_session_name": aws_session_name, - "aws_region_name": aws_region_name, - "aws_sts_endpoint": sts_endpoint, } ) @@ -147,6 +145,7 @@ class BaseAWSLLM(BaseLLM): RoleSessionName=aws_session_name, WebIdentityToken=oidc_token, DurationSeconds=3600, + Policy='{"Version":"2012-10-17","Statement":[{"Sid":"BedrockLiteLLM","Effect":"Allow","Action":["bedrock:InvokeModel","bedrock:InvokeModelWithResponseStream"],"Resource":"*","Condition":{"Bool":{"aws:SecureTransport":"true"},"StringLike":{"aws:UserAgent":"litellm/*"}}}]}', ) iam_creds_dict = { @@ -164,6 +163,11 @@ class BaseAWSLLM(BaseLLM): ttl=3600 - 60, ) + if sts_response["PackedPolicySize"] > 75: + verbose_logger.warning( + f"The policy size is greater than 75% of the allowed size, PackedPolicySize: {sts_response['PackedPolicySize']}" + ) + session = boto3.Session(**iam_creds_dict) iam_creds = session.get_credentials() diff --git a/litellm/llms/ollama.py b/litellm/llms/ollama.py index 9f62bab20..70678af64 100644 --- a/litellm/llms/ollama.py +++ b/litellm/llms/ollama.py @@ -423,13 +423,7 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob async for transformed_chunk in streamwrapper: yield transformed_chunk except Exception as e: - verbose_logger.exception( - "LiteLLM.ollama.py::ollama_async_streaming(): Exception occured - {}".format( - str(e) - ) - ) - - raise e + raise e # don't use verbose_logger.exception, if exception is raised async def ollama_acompletion( @@ -498,12 +492,7 @@ async def ollama_acompletion( ) return model_response except Exception as e: - verbose_logger.exception( - "LiteLLM.ollama.py::ollama_acompletion(): Exception occured - {}".format( - str(e) - ) - ) - raise e + raise e # don't use verbose_logger.exception, if exception is raised async def ollama_aembeddings( diff --git a/litellm/llms/ollama_chat.py b/litellm/llms/ollama_chat.py index 7c4cf7b37..2fc44f9cd 100644 --- a/litellm/llms/ollama_chat.py +++ b/litellm/llms/ollama_chat.py @@ -583,8 +583,4 @@ async def ollama_acompletion( ) return model_response except Exception as e: - verbose_logger.exception( - "LiteLLM.ollama_acompletion(): Exception occured - {}".format(str(e)) - ) - - raise e + raise e # don't use verbose_logger.exception, if exception is raised diff --git a/litellm/llms/palm.py b/litellm/llms/palm.py index a17fd02be..f6297e627 100644 --- a/litellm/llms/palm.py +++ b/litellm/llms/palm.py @@ -168,9 +168,6 @@ def completion( choices_list.append(choice_obj) model_response.choices = choices_list # type: ignore except Exception as e: - verbose_logger.exception( - "litellm.llms.palm.py::completion(): Exception occured - {}".format(str(e)) - ) raise PalmError( message=traceback.format_exc(), status_code=response.status_code ) diff --git a/litellm/llms/predibase.py b/litellm/llms/predibase.py index 84e2810a5..81e28934d 100644 --- a/litellm/llms/predibase.py +++ b/litellm/llms/predibase.py @@ -564,12 +564,9 @@ class PredibaseChatCompletion(BaseLLM): for exception in litellm.LITELLM_EXCEPTION_TYPES: if isinstance(e, exception): raise e - verbose_logger.exception( - "litellm.llms.predibase.py::async_completion() - Exception occurred - {}".format( - str(e) - ) - ) - raise PredibaseError(status_code=500, message="{}".format(str(e))) + raise PredibaseError( + status_code=500, message="{}".format(str(e)) + ) # don't use verbose_logger.exception, if exception is raised return self.process_response( model=model, response=response, diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index a1894d87f..d2b9db037 100644 --- a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -27,10 +27,13 @@ from litellm.types.completion import ( from litellm.types.llms.anthropic import * from litellm.types.llms.bedrock import MessageBlock as BedrockMessageBlock from litellm.types.llms.openai import ( + AllMessageValues, ChatCompletionAssistantMessage, + ChatCompletionAssistantToolCall, ChatCompletionFunctionMessage, ChatCompletionToolCallFunctionChunk, ChatCompletionToolMessage, + ChatCompletionUserMessage, ) from litellm.types.utils import GenericImageParsingChunk @@ -493,10 +496,9 @@ def hf_chat_template(model: str, messages: list, chat_template: Optional[Any] = return rendered_text except Exception as e: - verbose_logger.exception( - "Error rendering huggingface chat template - {}".format(str(e)) - ) - raise Exception(f"Error rendering template - {str(e)}") + raise Exception( + f"Error rendering template - {str(e)}" + ) # don't use verbose_logger.exception, if exception is raised # Anthropic template @@ -1171,7 +1173,9 @@ def convert_to_gemini_tool_call_result( return _part -def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResultParam: +def convert_to_anthropic_tool_result( + message: Union[dict, ChatCompletionToolMessage, ChatCompletionFunctionMessage] +) -> AnthropicMessagesToolResultParam: """ OpenAI message with a tool result looks like: { @@ -1215,7 +1219,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu return anthropic_tool_result if message["role"] == "function": content = message.get("content") # type: ignore - tool_call_id = message.get("tool_call_id") or str(uuid.uuid4()) + tool_call_id = message.get("tool_call_id") or str(uuid.uuid4()) # type: ignore anthropic_tool_result = AnthropicMessagesToolResultParam( type="tool_result", tool_use_id=tool_call_id, content=content ) @@ -1230,7 +1234,7 @@ def convert_to_anthropic_tool_result(message: dict) -> AnthropicMessagesToolResu def convert_function_to_anthropic_tool_invoke( - function_call, + function_call: Union[dict, ChatCompletionToolCallFunctionChunk], ) -> List[AnthropicMessagesToolUseParam]: try: anthropic_tool_invoke = [ @@ -1247,7 +1251,7 @@ def convert_function_to_anthropic_tool_invoke( def convert_to_anthropic_tool_invoke( - tool_calls: list, + tool_calls: List[ChatCompletionAssistantToolCall], ) -> List[AnthropicMessagesToolUseParam]: """ OpenAI tool invokes: @@ -1307,17 +1311,19 @@ def add_cache_control_to_content( anthropic_content_element: Union[ dict, AnthropicMessagesImageParam, AnthropicMessagesTextParam ], - orignal_content_element: dict, + orignal_content_element: Union[dict, AllMessageValues], ): - if "cache_control" in orignal_content_element: - anthropic_content_element["cache_control"] = orignal_content_element[ - "cache_control" - ] + cache_control_param = orignal_content_element.get("cache_control") + if cache_control_param is not None and isinstance(cache_control_param, dict): + transformed_param = ChatCompletionCachedContent(**cache_control_param) # type: ignore + + anthropic_content_element["cache_control"] = transformed_param + return anthropic_content_element def anthropic_messages_pt( - messages: list, + messages: List[AllMessageValues], model: str, llm_provider: str, ) -> List[ @@ -1348,10 +1354,21 @@ def anthropic_messages_pt( while msg_i < len(messages): user_content: List[AnthropicMessagesUserMessageValues] = [] init_msg_i = msg_i + if isinstance(messages[msg_i], BaseModel): + messages[msg_i] = dict(messages[msg_i]) # type: ignore ## MERGE CONSECUTIVE USER CONTENT ## while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types: - if isinstance(messages[msg_i]["content"], list): - for m in messages[msg_i]["content"]: + user_message_types_block: Union[ + ChatCompletionToolMessage, + ChatCompletionUserMessage, + ChatCompletionFunctionMessage, + ] = messages[ + msg_i + ] # type: ignore + if user_message_types_block["content"] and isinstance( + user_message_types_block["content"], list + ): + for m in user_message_types_block["content"]: if m.get("type", "") == "image_url": image_chunk = convert_to_anthropic_image_obj( m["image_url"]["url"] @@ -1382,15 +1399,24 @@ def anthropic_messages_pt( ) user_content.append(anthropic_content_element) elif ( - messages[msg_i]["role"] == "tool" - or messages[msg_i]["role"] == "function" + user_message_types_block["role"] == "tool" + or user_message_types_block["role"] == "function" ): # OpenAI's tool message content will always be a string - user_content.append(convert_to_anthropic_tool_result(messages[msg_i])) - else: user_content.append( - {"type": "text", "text": messages[msg_i]["content"]} + convert_to_anthropic_tool_result(user_message_types_block) ) + elif isinstance(user_message_types_block["content"], str): + _anthropic_content_text_element: AnthropicMessagesTextParam = { + "type": "text", + "text": user_message_types_block["content"], + } + anthropic_content_element = add_cache_control_to_content( + anthropic_content_element=_anthropic_content_text_element, + orignal_content_element=user_message_types_block, + ) + + user_content.append(anthropic_content_element) msg_i += 1 @@ -1400,10 +1426,11 @@ def anthropic_messages_pt( assistant_content: List[AnthropicMessagesAssistantMessageValues] = [] ## MERGE CONSECUTIVE ASSISTANT CONTENT ## while msg_i < len(messages) and messages[msg_i]["role"] == "assistant": - if "content" in messages[msg_i] and isinstance( - messages[msg_i]["content"], list + assistant_content_block: ChatCompletionAssistantMessage = messages[msg_i] # type: ignore + if "content" in assistant_content_block and isinstance( + assistant_content_block["content"], list ): - for m in messages[msg_i]["content"]: + for m in assistant_content_block["content"]: # handle text if ( m.get("type", "") == "text" and len(m.get("text", "")) > 0 @@ -1417,35 +1444,37 @@ def anthropic_messages_pt( ) assistant_content.append(anthropic_message) elif ( - "content" in messages[msg_i] - and isinstance(messages[msg_i]["content"], str) - and len(messages[msg_i]["content"]) - > 0 # don't pass empty text blocks. anthropic api raises errors. + "content" in assistant_content_block + and isinstance(assistant_content_block["content"], str) + and assistant_content_block[ + "content" + ] # don't pass empty text blocks. anthropic api raises errors. ): _anthropic_text_content_element = { "type": "text", - "text": messages[msg_i]["content"], + "text": assistant_content_block["content"], } anthropic_content_element = add_cache_control_to_content( anthropic_content_element=_anthropic_text_content_element, - orignal_content_element=messages[msg_i], + orignal_content_element=assistant_content_block, ) assistant_content.append(anthropic_content_element) - if messages[msg_i].get( - "tool_calls", [] + assistant_tool_calls = assistant_content_block.get("tool_calls") + if ( + assistant_tool_calls is not None ): # support assistant tool invoke conversion assistant_content.extend( - convert_to_anthropic_tool_invoke(messages[msg_i]["tool_calls"]) + convert_to_anthropic_tool_invoke(assistant_tool_calls) ) - if messages[msg_i].get("function_call"): + assistant_function_call = assistant_content_block.get("function_call") + + if assistant_function_call is not None: assistant_content.extend( - convert_function_to_anthropic_tool_invoke( - messages[msg_i]["function_call"] - ) + convert_function_to_anthropic_tool_invoke(assistant_function_call) ) msg_i += 1 diff --git a/litellm/llms/text_completion_codestral.py b/litellm/llms/text_completion_codestral.py index 9dbe3bb37..9bcd64631 100644 --- a/litellm/llms/text_completion_codestral.py +++ b/litellm/llms/text_completion_codestral.py @@ -491,14 +491,9 @@ class CodestralTextCompletion(BaseLLM): message="HTTPStatusError - {}".format(e.response.text), ) except Exception as e: - verbose_logger.exception( - "litellm.llms.text_completion_codestral.py::async_completion() - Exception occurred - {}".format( - str(e) - ) - ) raise TextCompletionCodestralError( status_code=500, message="{}".format(str(e)) - ) + ) # don't use verbose_logger.exception, if exception is raised return self.process_text_completion_response( model=model, response=response, diff --git a/litellm/main.py b/litellm/main.py index e35014abb..9e7297e11 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -445,9 +445,6 @@ async def acompletion( ) # sets the logging event loop if the user does sync streaming (e.g. on proxy for sagemaker calls) return response except Exception as e: - verbose_logger.exception( - "litellm.main.py::acompletion() - Exception occurred - {}".format(str(e)) - ) custom_llm_provider = custom_llm_provider or "openai" raise exception_type( model=model, @@ -616,9 +613,6 @@ def mock_completion( except Exception as e: if isinstance(e, openai.APIError): raise e - verbose_logger.exception( - "litellm.mock_completion(): Exception occured - {}".format(str(e)) - ) raise Exception("Mock completion response failed") @@ -5125,9 +5119,6 @@ async def ahealth_check( response = {} # args like remaining ratelimit etc. return response except Exception as e: - verbose_logger.exception( - "litellm.ahealth_check(): Exception occured - {}".format(str(e)) - ) stack_trace = traceback.format_exc() if isinstance(stack_trace, str): stack_trace = stack_trace[:1000] diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 6c1078fa8..c015f2085 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,6 +1,16 @@ + model_list: - - model_name: "whisper" - litellm_params: - model: "azure/azure-whisper" - api_key: os.environ/AZURE_EUROPE_API_KEY - api_base: "https://my-endpoint-europe-berri-992.openai.azure.com/" +- model_name: gpt-4o-mini-2024-07-18 + litellm_params: + api_key: API_KEY + model: openai/gpt-4o-mini-2024-07-18 + rpm: 0 + tpm: 100 + +router_settings: + num_retries: 0 + routing_strategy: usage-based-routing-v2 + timeout: 10 + +litellm_settings: + callbacks: custom_callbacks.proxy_handler_instance diff --git a/litellm/proxy/auth/user_api_key_auth.py b/litellm/proxy/auth/user_api_key_auth.py index 4c16c0345..2480263df 100644 --- a/litellm/proxy/auth/user_api_key_auth.py +++ b/litellm/proxy/auth/user_api_key_auth.py @@ -386,7 +386,6 @@ async def user_api_key_auth( parent_otel_span=parent_otel_span, ) #### ELSE #### - ## CHECK PASS-THROUGH ENDPOINTS ## if pass_through_endpoints is not None: for endpoint in pass_through_endpoints: diff --git a/litellm/proxy/custom_callbacks.py b/litellm/proxy/custom_callbacks.py index 40fc0d369..1516bfd24 100644 --- a/litellm/proxy/custom_callbacks.py +++ b/litellm/proxy/custom_callbacks.py @@ -1,66 +1,10 @@ from litellm.integrations.custom_logger import CustomLogger -import litellm -# This file includes the custom callbacks for LiteLLM Proxy -# Once defined, these can be passed in proxy_config.yaml class MyCustomHandler(CustomLogger): - def log_pre_api_call(self, model, messages, kwargs): - print(f"Pre-API Call") # noqa - - def log_post_api_call(self, kwargs, response_obj, start_time, end_time): - print(f"Post-API Call") # noqa - - def log_stream_event(self, kwargs, response_obj, start_time, end_time): - print(f"On Stream") # noqa - - def log_success_event(self, kwargs, response_obj, start_time, end_time): - print("On Success") # noqa - - def log_failure_event(self, kwargs, response_obj, start_time, end_time): - print(f"On Failure") # noqa - - async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): - print(f"ishaan async_log_success_event") # noqa - # log: key, user, model, prompt, response, tokens, cost - # Access kwargs passed to litellm.completion() - model = kwargs.get("model", None) - messages = kwargs.get("messages", None) - user = kwargs.get("user", None) - - # Access litellm_params passed to litellm.completion(), example access `metadata` - litellm_params = kwargs.get("litellm_params", {}) - metadata = litellm_params.get( - "metadata", {} - ) # headers passed to LiteLLM proxy, can be found here - - return - async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - try: - print(f"On Async Failure !") # noqa - print("\nkwargs", kwargs) # noqa - # Access kwargs passed to litellm.completion() - model = kwargs.get("model", None) - messages = kwargs.get("messages", None) - user = kwargs.get("user", None) - - # Access litellm_params passed to litellm.completion(), example access `metadata` - litellm_params = kwargs.get("litellm_params", {}) - metadata = litellm_params.get( - "metadata", {} - ) # headers passed to LiteLLM proxy, can be found here - - # Acess Exceptions & Traceback - exception_event = kwargs.get("exception", None) - traceback_event = kwargs.get("traceback_exception", None) - - # Calculate cost using litellm.completion_cost() - except Exception as e: - print(f"Exception: {e}") # noqa + # print("Call failed") + pass proxy_handler_instance = MyCustomHandler() - -# Set litellm.callbacks = [proxy_handler_instance] on the proxy -# need to set litellm.callbacks = [proxy_handler_instance] # on the proxy diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index be8ccbaeb..b0eab1ba8 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -6183,6 +6183,64 @@ async def delete_end_user( pass +@router.get( + "/customer/list", + tags=["Customer Management"], + dependencies=[Depends(user_api_key_auth)], + response_model=List[LiteLLM_EndUserTable], +) +@router.get( + "/end_user/list", + tags=["Customer Management"], + include_in_schema=False, + dependencies=[Depends(user_api_key_auth)], +) +async def list_team( + http_request: Request, + user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), +): + """ + [Admin-only] List all available customers + + ``` + curl --location --request GET 'http://0.0.0.0:4000/customer/list' \ + --header 'Authorization: Bearer sk-1234' + ``` + """ + from litellm.proxy.proxy_server import ( + _duration_in_seconds, + create_audit_log_for_update, + litellm_proxy_admin_name, + prisma_client, + ) + + if ( + user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN + and user_api_key_dict.user_role != LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY + ): + raise HTTPException( + status_code=401, + detail={ + "error": "Admin-only endpoint. Your user role={}".format( + user_api_key_dict.user_role + ) + }, + ) + + if prisma_client is None: + raise HTTPException( + status_code=400, + detail={"error": CommonProxyErrors.db_not_connected_error.value}, + ) + + response = await prisma_client.db.litellm_endusertable.find_many() + + returned_response: List[LiteLLM_EndUserTable] = [] + for item in response: + returned_response.append(LiteLLM_EndUserTable(**item.model_dump())) + return returned_response + + async def create_audit_log_for_update(request_data: LiteLLM_AuditLogs): if premium_user is not True: return diff --git a/litellm/router.py b/litellm/router.py index 15fb4cb27..2743a36b9 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -47,6 +47,7 @@ from litellm._logging import verbose_router_logger from litellm.assistants.main import AssistantDeleted from litellm.caching import DualCache, InMemoryCache, RedisCache from litellm.integrations.custom_logger import CustomLogger +from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLogging from litellm.llms.azure import get_azure_ad_token_from_oidc from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler @@ -783,6 +784,10 @@ class Router: } ) + logging_obj: Optional[LiteLLMLogging] = kwargs.get( + "litellm_logging_obj", None + ) + rpm_semaphore = self._get_client( deployment=deployment, kwargs=kwargs, @@ -797,11 +802,13 @@ class Router: - If allowed, increment the rpm limit (allows global value to be updated, concurrency-safe) """ await self.async_routing_strategy_pre_call_checks( - deployment=deployment + deployment=deployment, logging_obj=logging_obj ) response = await _response else: - await self.async_routing_strategy_pre_call_checks(deployment=deployment) + await self.async_routing_strategy_pre_call_checks( + deployment=deployment, logging_obj=logging_obj + ) response = await _response ## CHECK CONTENT FILTER ERROR ## @@ -3860,7 +3867,9 @@ class Router: if isinstance(_callback, CustomLogger): response = _callback.pre_call_check(deployment) - async def async_routing_strategy_pre_call_checks(self, deployment: dict): + async def async_routing_strategy_pre_call_checks( + self, deployment: dict, logging_obj: Optional[LiteLLMLogging] = None + ): """ For usage-based-routing-v2, enables running rpm checks before the call is made, inside the semaphore. @@ -3875,8 +3884,22 @@ class Router: for _callback in litellm.callbacks: if isinstance(_callback, CustomLogger): try: - response = await _callback.async_pre_call_check(deployment) + _ = await _callback.async_pre_call_check(deployment) except litellm.RateLimitError as e: + ## LOG FAILURE EVENT + if logging_obj is not None: + asyncio.create_task( + logging_obj.async_failure_handler( + exception=e, + traceback_exception=traceback.format_exc(), + end_time=time.time(), + ) + ) + ## LOGGING + threading.Thread( + target=logging_obj.failure_handler, + args=(e, traceback.format_exc()), + ).start() # log response self._set_cooldown_deployments( exception_status=e.status_code, original_exception=e, @@ -3885,6 +3908,20 @@ class Router: ) raise e except Exception as e: + ## LOG FAILURE EVENT + if logging_obj is not None: + asyncio.create_task( + logging_obj.async_failure_handler( + exception=e, + traceback_exception=traceback.format_exc(), + end_time=time.time(), + ) + ) + ## LOGGING + threading.Thread( + target=logging_obj.failure_handler, + args=(e, traceback.format_exc()), + ).start() # log response raise e def _generate_model_id(self, model_group: str, litellm_params: dict): diff --git a/litellm/tests/test_anthropic_prompt_caching.py b/litellm/tests/test_anthropic_prompt_caching.py index b9c70f0c3..06f6916ed 100644 --- a/litellm/tests/test_anthropic_prompt_caching.py +++ b/litellm/tests/test_anthropic_prompt_caching.py @@ -222,6 +222,94 @@ async def test_anthropic_api_prompt_caching_basic(): ) +@pytest.mark.asyncio() +async def test_anthropic_api_prompt_caching_with_content_str(): + from litellm.llms.prompt_templates.factory import anthropic_messages_pt + + system_message = [ + { + "role": "system", + "content": "Here is the full text of a complex legal agreement", + "cache_control": {"type": "ephemeral"}, + }, + ] + translated_system_message = litellm.AnthropicConfig().translate_system_message( + messages=system_message + ) + + assert translated_system_message == [ + # System Message + { + "type": "text", + "text": "Here is the full text of a complex legal agreement", + "cache_control": {"type": "ephemeral"}, + } + ] + user_messages = [ + # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache. + { + "role": "user", + "content": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + }, + { + "role": "assistant", + "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo", + }, + # The final turn is marked with cache-control, for continuing in followups. + { + "role": "user", + "content": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + }, + ] + + translated_messages = anthropic_messages_pt( + messages=user_messages, + model="claude-3-5-sonnet-20240620", + llm_provider="anthropic", + ) + + expected_messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + } + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo", + } + ], + }, + # The final turn is marked with cache-control, for continuing in followups. + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + } + ], + }, + ] + + assert len(translated_messages) == len(expected_messages) + for idx, i in enumerate(translated_messages): + assert ( + i == expected_messages[idx] + ), "Error on idx={}. Got={}, Expected={}".format(idx, i, expected_messages[idx]) + + @pytest.mark.asyncio() async def test_anthropic_api_prompt_caching_no_headers(): litellm.set_verbose = True diff --git a/litellm/tests/test_bedrock_completion.py b/litellm/tests/test_bedrock_completion.py index e6c657f07..bc27c5118 100644 --- a/litellm/tests/test_bedrock_completion.py +++ b/litellm/tests/test_bedrock_completion.py @@ -616,8 +616,8 @@ def test_completion_bedrock_httpx_command_r_sts_oidc_auth(): aws_region_name=aws_region_name, aws_web_identity_token=aws_web_identity_token, aws_role_name=aws_role_name, - aws_session_name="my-test-session", - aws_sts_endpoint="https://sts-fips.us-west-2.amazonaws.com", + aws_session_name="cross-region-test", + aws_sts_endpoint="https://sts-fips.us-east-2.amazonaws.com", aws_bedrock_runtime_endpoint="https://bedrock-runtime-fips.us-west-2.amazonaws.com", ) # Add any assertions here to check the response diff --git a/litellm/types/llms/anthropic.py b/litellm/types/llms/anthropic.py index 7b856a284..720abf8dd 100644 --- a/litellm/types/llms/anthropic.py +++ b/litellm/types/llms/anthropic.py @@ -3,6 +3,8 @@ from typing import Any, Dict, Iterable, List, Optional, Union from pydantic import BaseModel, validator from typing_extensions import Literal, Required, TypedDict +from .openai import ChatCompletionCachedContent + class AnthropicMessagesToolChoice(TypedDict, total=False): type: Required[Literal["auto", "any", "tool"]] @@ -18,7 +20,7 @@ class AnthropicMessagesTool(TypedDict, total=False): class AnthropicMessagesTextParam(TypedDict, total=False): type: Literal["text"] text: str - cache_control: Optional[dict] + cache_control: Optional[Union[dict, ChatCompletionCachedContent]] class AnthropicMessagesToolUseParam(TypedDict): @@ -58,7 +60,7 @@ class AnthropicImageParamSource(TypedDict): class AnthropicMessagesImageParam(TypedDict, total=False): type: Literal["image"] source: AnthropicImageParamSource - cache_control: Optional[dict] + cache_control: Optional[Union[dict, ChatCompletionCachedContent]] class AnthropicMessagesToolResultContent(TypedDict): @@ -97,7 +99,7 @@ class AnthropicMetadata(TypedDict, total=False): class AnthropicSystemMessageContent(TypedDict, total=False): type: str text: str - cache_control: Optional[dict] + cache_control: Optional[Union[dict, ChatCompletionCachedContent]] class AnthropicMessagesRequest(TypedDict, total=False): diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py index 0219145c6..788199c00 100644 --- a/litellm/types/llms/openai.py +++ b/litellm/types/llms/openai.py @@ -354,14 +354,18 @@ class ChatCompletionImageObject(TypedDict): image_url: ChatCompletionImageUrlObject -class ChatCompletionUserMessage(TypedDict): +class OpenAIChatCompletionUserMessage(TypedDict): role: Literal["user"] content: Union[ str, Iterable[Union[ChatCompletionTextObject, ChatCompletionImageObject]] ] -class ChatCompletionAssistantMessage(TypedDict, total=False): +class ChatCompletionUserMessage(OpenAIChatCompletionUserMessage, total=False): + cache_control: ChatCompletionCachedContent + + +class OpenAIChatCompletionAssistantMessage(TypedDict, total=False): role: Required[Literal["assistant"]] content: Optional[Union[str, Iterable[ChatCompletionTextObject]]] name: Optional[str] @@ -369,6 +373,10 @@ class ChatCompletionAssistantMessage(TypedDict, total=False): function_call: Optional[ChatCompletionToolCallFunctionChunk] +class ChatCompletionAssistantMessage(OpenAIChatCompletionAssistantMessage, total=False): + cache_control: ChatCompletionCachedContent + + class ChatCompletionToolMessage(TypedDict): role: Literal["tool"] content: str @@ -381,12 +389,16 @@ class ChatCompletionFunctionMessage(TypedDict): name: str -class ChatCompletionSystemMessage(TypedDict, total=False): +class OpenAIChatCompletionSystemMessage(TypedDict, total=False): role: Required[Literal["system"]] content: Required[Union[str, List]] name: str +class ChatCompletionSystemMessage(OpenAIChatCompletionSystemMessage, total=False): + cache_control: ChatCompletionCachedContent + + AllMessageValues = Union[ ChatCompletionUserMessage, ChatCompletionAssistantMessage, diff --git a/litellm/utils.py b/litellm/utils.py index 06de72b9d..33d3a59a3 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8547,11 +8547,6 @@ class CustomStreamWrapper: "finish_reason": finish_reason, } except Exception as e: - verbose_logger.exception( - "litellm.CustomStreamWrapper.handle_predibase_chunk(): Exception occured - {}".format( - str(e) - ) - ) raise e def handle_huggingface_chunk(self, chunk): @@ -8595,11 +8590,6 @@ class CustomStreamWrapper: "finish_reason": finish_reason, } except Exception as e: - verbose_logger.exception( - "litellm.CustomStreamWrapper.handle_huggingface_chunk(): Exception occured - {}".format( - str(e) - ) - ) raise e def handle_ai21_chunk(self, chunk): # fake streaming @@ -8826,11 +8816,6 @@ class CustomStreamWrapper: "usage": usage, } except Exception as e: - verbose_logger.exception( - "litellm.CustomStreamWrapper.handle_openai_chat_completion_chunk(): Exception occured - {}".format( - str(e) - ) - ) raise e def handle_azure_text_completion_chunk(self, chunk):