diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d429bc6b8..a33473b72 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,12 +1,12 @@ repos: - repo: local hooks: - # - id: mypy - # name: mypy - # entry: python3 -m mypy --ignore-missing-imports - # language: system - # types: [python] - # files: ^litellm/ + - id: mypy + name: mypy + entry: python3 -m mypy --ignore-missing-imports + language: system + types: [python] + files: ^litellm/ - id: isort name: isort entry: isort diff --git a/docs/my-website/docs/proxy/alerting.md b/docs/my-website/docs/proxy/alerting.md index 7841ace58..257f5af81 100644 --- a/docs/my-website/docs/proxy/alerting.md +++ b/docs/my-website/docs/proxy/alerting.md @@ -190,6 +190,36 @@ curl -i http://localhost:4000/v1/chat/completions \ ``` +## Advanced - provide multiple slack channels for a given alert type + +Just add it like this - `alert_type: [, ]`. + +1. Setup config.yaml + +```yaml +general_settings: + master_key: sk-1234 + alerting: ["slack"] + alert_to_webhook_url: { + "spend_reports": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"] + } +``` + +2. Start proxy + +```bash +litellm --config /path/to/config.yaml +``` + +3. Test it! + +```bash +curl -X GET 'http://0.0.0.0:4000/health/services?service=slack' \ +-H 'Authorization: Bearer sk-1234' +``` + +In case of error, check server logs for the error message! + ## Advanced - Using MS Teams Webhooks MS Teams provides a slack compatible webhook url that you can use for alerting diff --git a/litellm/__init__.py b/litellm/__init__.py index f67e2ca83..496bb9db7 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -900,6 +900,14 @@ from .llms.bedrock.common_utils import ( AmazonMistralConfig, AmazonBedrockGlobalConfig, ) +from .llms.bedrock.embed.amazon_titan_g1_transformation import AmazonTitanG1Config +from .llms.bedrock.embed.amazon_titan_multimodal_transformation import ( + AmazonTitanMultimodalEmbeddingG1Config, +) +from .llms.bedrock.embed.amazon_titan_v2_transformation import ( + AmazonTitanV2Config, +) +from .llms.bedrock.embed.cohere_transformation import BedrockCohereEmbeddingConfig from .llms.openai import ( OpenAIConfig, OpenAITextCompletionConfig, diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index bc90e86a8..20accb1b4 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -1514,7 +1514,9 @@ Model Info: self.alert_to_webhook_url is not None and alert_type in self.alert_to_webhook_url ): - slack_webhook_url = self.alert_to_webhook_url[alert_type] + slack_webhook_url: Optional[Union[str, List[str]]] = ( + self.alert_to_webhook_url[alert_type] + ) elif self.default_webhook_url is not None: slack_webhook_url = self.default_webhook_url else: @@ -1525,18 +1527,39 @@ Model Info: payload = {"text": formatted_message} headers = {"Content-type": "application/json"} - response = await self.async_http_handler.post( - url=slack_webhook_url, - headers=headers, - data=json.dumps(payload), - ) - if response.status_code == 200: - pass - else: - verbose_proxy_logger.debug( - "Error sending slack alert. 
Error={}".format(response.text) + async def send_to_webhook(url: str): + return await self.async_http_handler.post( + url=url, + headers=headers, + data=json.dumps(payload), ) + if isinstance(slack_webhook_url, list): + # Parallelize the calls if it's a list of URLs + responses = await asyncio.gather( + *[send_to_webhook(url) for url in slack_webhook_url] + ) + + for response, url in zip(responses, slack_webhook_url): + if response.status_code == 200: + pass + else: + verbose_proxy_logger.debug( + "Error sending slack alert to url={}. Error={}".format( + url, response.text + ) + ) + else: + # Single call if it's a single URL + response = await send_to_webhook(slack_webhook_url) + + if response.status_code == 200: + pass + else: + verbose_proxy_logger.debug( + "Error sending slack alert. Error={}".format(response.text) + ) + async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): """Log deployment latency""" try: @@ -1718,7 +1741,9 @@ Model Info: try: from calendar import monthrange - from litellm.proxy.proxy_server import _get_spend_report_for_time_range + from litellm.proxy.spend_tracking.spend_management_endpoints import ( + _get_spend_report_for_time_range, + ) todays_date = datetime.datetime.now().date() first_day_of_month = todays_date.replace(day=1) @@ -1763,7 +1788,7 @@ Model Info: alerting_metadata={}, ) except Exception as e: - verbose_proxy_logger.error("Error sending weekly spend report %s", e) + verbose_proxy_logger.exception("Error sending weekly spend report %s", e) async def send_fallback_stats_from_prometheus(self): """ diff --git a/litellm/llms/anthropic/chat.py b/litellm/llms/anthropic/chat.py index f62c7246e..c3ad03859 100644 --- a/litellm/llms/anthropic/chat.py +++ b/litellm/llms/anthropic/chat.py @@ -30,6 +30,7 @@ from litellm.llms.custom_httpx.http_handler import ( ) from litellm.types.llms.anthropic import ( AnthopicMessagesAssistantMessageParam, + AnthropicChatCompletionUsageBlock, AnthropicFinishReason, AnthropicMessagesRequest, AnthropicMessagesTool, @@ -1177,6 +1178,30 @@ class ModelResponseIterator: return True return False + def _handle_usage( + self, anthropic_usage_chunk: dict + ) -> AnthropicChatCompletionUsageBlock: + special_fields = ["input_tokens", "output_tokens"] + + usage_block = AnthropicChatCompletionUsageBlock( + prompt_tokens=anthropic_usage_chunk.get("input_tokens", 0), + completion_tokens=anthropic_usage_chunk.get("output_tokens", 0), + total_tokens=anthropic_usage_chunk.get("input_tokens", 0) + + anthropic_usage_chunk.get("output_tokens", 0), + ) + + if "cache_creation_input_tokens" in anthropic_usage_chunk: + usage_block["cache_creation_input_tokens"] = anthropic_usage_chunk[ + "cache_creation_input_tokens" + ] + + if "cache_read_input_tokens" in anthropic_usage_chunk: + usage_block["cache_read_input_tokens"] = anthropic_usage_chunk[ + "cache_read_input_tokens" + ] + + return usage_block + def chunk_parser(self, chunk: dict) -> GenericStreamingChunk: try: type_chunk = chunk.get("type", "") or "" @@ -1252,12 +1277,7 @@ class ModelResponseIterator: finish_reason=message_delta["delta"].get("stop_reason", "stop") or "stop" ) - usage = ChatCompletionUsageBlock( - prompt_tokens=message_delta["usage"].get("input_tokens", 0), - completion_tokens=message_delta["usage"].get("output_tokens", 0), - total_tokens=message_delta["usage"].get("input_tokens", 0) - + message_delta["usage"].get("output_tokens", 0), - ) + usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"]) is_finished = True elif type_chunk == 
"message_start": """ @@ -1280,19 +1300,8 @@ class ModelResponseIterator: } """ message_start_block = MessageStartBlock(**chunk) # type: ignore - usage = ChatCompletionUsageBlock( - prompt_tokens=message_start_block["message"] - .get("usage", {}) - .get("input_tokens", 0), - completion_tokens=message_start_block["message"] - .get("usage", {}) - .get("output_tokens", 0), - total_tokens=message_start_block["message"] - .get("usage", {}) - .get("input_tokens", 0) - + message_start_block["message"] - .get("usage", {}) - .get("output_tokens", 0), + usage = self._handle_usage( + anthropic_usage_chunk=message_start_block["message"]["usage"] ) elif type_chunk == "error": """ diff --git a/litellm/llms/bedrock/chat.py b/litellm/llms/bedrock/chat.py index 0289b5dc3..73e649c5b 100644 --- a/litellm/llms/bedrock/chat.py +++ b/litellm/llms/bedrock/chat.py @@ -43,6 +43,10 @@ from litellm.types.llms.openai import ( ChatCompletionResponseMessage, ChatCompletionToolCallChunk, ChatCompletionToolCallFunctionChunk, + ChatCompletionToolChoiceFunctionParam, + ChatCompletionToolChoiceObjectParam, + ChatCompletionToolParam, + ChatCompletionToolParamFunctionChunk, ChatCompletionUsageBlock, ) from litellm.types.utils import GenericStreamingChunk as GChunk @@ -1152,6 +1156,7 @@ class AmazonConverseConfig: "temperature", "top_p", "extra_headers", + "response_format", ] if ( @@ -1210,6 +1215,48 @@ class AmazonConverseConfig: drop_params: bool, ) -> dict: for param, value in non_default_params.items(): + if param == "response_format": + json_schema: Optional[dict] = None + schema_name: str = "" + if "response_schema" in value: + json_schema = value["response_schema"] + schema_name = "json_tool_call" + elif "json_schema" in value: + json_schema = value["json_schema"]["schema"] + schema_name = value["json_schema"]["name"] + """ + Follow similar approach to anthropic - translate to a single tool call. + + When using tools in this way: - https://docs.anthropic.com/en/docs/build-with-claude/tool-use#json-mode + - You usually want to provide a single tool + - You should set tool_choice (see Forcing tool use) to instruct the model to explicitly use that tool + - Remember that the model will pass the input to the tool, so the name of the tool and description should be from the model’s perspective. + """ + if json_schema is not None: + _tool_choice = self.map_tool_choice_values( + model=model, tool_choice="required", drop_params=drop_params # type: ignore + ) + + _tool = ChatCompletionToolParam( + type="function", + function=ChatCompletionToolParamFunctionChunk( + name=schema_name, parameters=json_schema + ), + ) + + optional_params["tools"] = [_tool] + optional_params["tool_choice"] = _tool_choice + optional_params["json_mode"] = True + else: + if litellm.drop_params is True or drop_params is True: + pass + else: + raise litellm.utils.UnsupportedParamsError( + message="Bedrock doesn't support response_format={}. 
To drop it from the call, set `litellm.drop_params = True.".format( + value + ), + status_code=400, + ) if param == "max_tokens": optional_params["maxTokens"] = value if param == "stream": @@ -1263,7 +1310,7 @@ class BedrockConverseLLM(BaseAWSLLM): additional_args={"complete_input_dict": data}, ) print_verbose(f"raw model_response: {response.text}") - + json_mode: Optional[bool] = optional_params.pop("json_mode", None) ## RESPONSE OBJECT try: completion_response = ConverseResponseBlock(**response.json()) # type: ignore @@ -1332,6 +1379,7 @@ class BedrockConverseLLM(BaseAWSLLM): name=response_tool_name, arguments=json.dumps(content["toolUse"]["input"]), ) + _tool_response_chunk = ChatCompletionToolCallChunk( id=content["toolUse"]["toolUseId"], type="function", @@ -1340,7 +1388,14 @@ class BedrockConverseLLM(BaseAWSLLM): ) tools.append(_tool_response_chunk) chat_completion_message["content"] = content_str - chat_completion_message["tool_calls"] = tools + + if json_mode is True and tools is not None and len(tools) == 1: + # to support 'json_schema' logic on bedrock models + json_mode_content_str: Optional[str] = tools[0]["function"].get("arguments") + if json_mode_content_str is not None: + chat_completion_message["content"] = json_mode_content_str + else: + chat_completion_message["tool_calls"] = tools ## CALCULATING USAGE - bedrock returns usage in the headers input_tokens = completion_response["usage"]["inputTokens"] @@ -1586,6 +1641,9 @@ class BedrockConverseLLM(BaseAWSLLM): supported_converse_params = AmazonConverseConfig.__annotations__.keys() supported_tool_call_params = ["tools", "tool_choice"] supported_guardrail_params = ["guardrailConfig"] + json_mode: Optional[bool] = inference_params.pop( + "json_mode", None + ) # used for handling json_schema ## TRANSFORMATION ## bedrock_messages: List[MessageBlock] = _bedrock_converse_messages_pt( @@ -2028,8 +2086,14 @@ class MockResponseIterator: # for returning ai21 streaming responses text=chunk_data.choices[0].message.content or "", # type: ignore tool_use=None, is_finished=True, - finish_reason=chunk_data.choices[0].finish_reason, # type: ignore - usage=chunk_usage, # type: ignore + finish_reason=map_finish_reason( + finish_reason=chunk_data.choices[0].finish_reason or "" + ), + usage=ChatCompletionUsageBlock( + prompt_tokens=chunk_usage.prompt_tokens, + completion_tokens=chunk_usage.completion_tokens, + total_tokens=chunk_usage.total_tokens, + ), index=0, ) return processed_chunk diff --git a/litellm/llms/bedrock/embed/amazon_titan_g1_transformation.py b/litellm/llms/bedrock/embed/amazon_titan_g1_transformation.py index c86bade5d..591f87209 100644 --- a/litellm/llms/bedrock/embed/amazon_titan_g1_transformation.py +++ b/litellm/llms/bedrock/embed/amazon_titan_g1_transformation.py @@ -15,8 +15,6 @@ from typing import List, Optional from litellm.types.llms.bedrock import ( AmazonTitanG1EmbeddingRequest, AmazonTitanG1EmbeddingResponse, - AmazonTitanV2EmbeddingRequest, - AmazonTitanV2EmbeddingResponse, ) from litellm.types.utils import Embedding, EmbeddingResponse, Usage @@ -52,6 +50,14 @@ class AmazonTitanG1Config: and v is not None } + def get_supported_openai_params(self) -> List[str]: + return [] + + def map_openai_params( + self, non_default_params: dict, optional_params: dict + ) -> dict: + return optional_params + def _transform_request( self, input: str, inference_params: dict ) -> AmazonTitanG1EmbeddingRequest: @@ -80,70 +86,3 @@ class AmazonTitanG1Config: total_tokens=total_prompt_tokens, ) return EmbeddingResponse(model=model, 
usage=usage, data=transformed_responses) - - -class AmazonTitanV2Config: - """ - Reference: https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html - - normalize: boolean - flag indicating whether or not to normalize the output embeddings. Defaults to true - dimensions: int - The number of dimensions the output embeddings should have. The following values are accepted: 1024 (default), 512, 256. - """ - - normalize: Optional[bool] = None - dimensions: Optional[int] = None - - def __init__( - self, normalize: Optional[bool] = None, dimensions: Optional[int] = None - ) -> None: - locals_ = locals() - for key, value in locals_.items(): - if key != "self" and value is not None: - setattr(self.__class__, key, value) - - @classmethod - def get_config(cls): - return { - k: v - for k, v in cls.__dict__.items() - if not k.startswith("__") - and not isinstance( - v, - ( - types.FunctionType, - types.BuiltinFunctionType, - classmethod, - staticmethod, - ), - ) - and v is not None - } - - def _transform_request( - self, input: str, inference_params: dict - ) -> AmazonTitanV2EmbeddingRequest: - return AmazonTitanV2EmbeddingRequest(inputText=input, **inference_params) # type: ignore - - def _transform_response( - self, response_list: List[dict], model: str - ) -> EmbeddingResponse: - total_prompt_tokens = 0 - - transformed_responses: List[Embedding] = [] - for index, response in enumerate(response_list): - _parsed_response = AmazonTitanV2EmbeddingResponse(**response) # type: ignore - transformed_responses.append( - Embedding( - embedding=_parsed_response["embedding"], - index=index, - object="embedding", - ) - ) - total_prompt_tokens += _parsed_response["inputTextTokenCount"] - - usage = Usage( - prompt_tokens=total_prompt_tokens, - completion_tokens=0, - total_tokens=total_prompt_tokens, - ) - return EmbeddingResponse(model=model, usage=usage, data=transformed_responses) diff --git a/litellm/llms/bedrock/embed/amazon_titan_multimodal_transformation.py b/litellm/llms/bedrock/embed/amazon_titan_multimodal_transformation.py index 7e2b6176d..6becff6ef 100644 --- a/litellm/llms/bedrock/embed/amazon_titan_multimodal_transformation.py +++ b/litellm/llms/bedrock/embed/amazon_titan_multimodal_transformation.py @@ -17,38 +17,64 @@ from litellm.types.utils import Embedding, EmbeddingResponse, Usage from litellm.utils import is_base64_encoded -def _transform_request( - input: str, inference_params: dict -) -> AmazonTitanMultimodalEmbeddingRequest: - ## check if b64 encoded str or not ## - is_encoded = is_base64_encoded(input) - if is_encoded: # check if string is b64 encoded image or not - transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputImage=input) - else: - transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputText=input) +class AmazonTitanMultimodalEmbeddingG1Config: + """ + Reference - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-mm.html + """ - for k, v in inference_params.items(): - transformed_request[k] = v # type: ignore + def __init__(self) -> None: + pass - return transformed_request + def get_supported_openai_params(self) -> List[str]: + return ["dimensions"] + def map_openai_params( + self, non_default_params: dict, optional_params: dict + ) -> dict: + for k, v in non_default_params.items(): + if k == "dimensions": + optional_params["embeddingConfig"] = ( + AmazonTitanMultimodalEmbeddingConfig(outputEmbeddingLength=v) + ) + return optional_params -def _transform_response(response_list: List[dict], 
model: str) -> EmbeddingResponse: - - total_prompt_tokens = 0 - transformed_responses: List[Embedding] = [] - for index, response in enumerate(response_list): - _parsed_response = AmazonTitanMultimodalEmbeddingResponse(**response) # type: ignore - transformed_responses.append( - Embedding( - embedding=_parsed_response["embedding"], index=index, object="embedding" + def _transform_request( + self, input: str, inference_params: dict + ) -> AmazonTitanMultimodalEmbeddingRequest: + ## check if b64 encoded str or not ## + is_encoded = is_base64_encoded(input) + if is_encoded: # check if string is b64 encoded image or not + transformed_request = AmazonTitanMultimodalEmbeddingRequest( + inputImage=input ) - ) - total_prompt_tokens += _parsed_response["inputTextTokenCount"] + else: + transformed_request = AmazonTitanMultimodalEmbeddingRequest(inputText=input) - usage = Usage( - prompt_tokens=total_prompt_tokens, - completion_tokens=0, - total_tokens=total_prompt_tokens, - ) - return EmbeddingResponse(model=model, usage=usage, data=transformed_responses) + for k, v in inference_params.items(): + transformed_request[k] = v # type: ignore + + return transformed_request + + def _transform_response( + self, response_list: List[dict], model: str + ) -> EmbeddingResponse: + + total_prompt_tokens = 0 + transformed_responses: List[Embedding] = [] + for index, response in enumerate(response_list): + _parsed_response = AmazonTitanMultimodalEmbeddingResponse(**response) # type: ignore + transformed_responses.append( + Embedding( + embedding=_parsed_response["embedding"], + index=index, + object="embedding", + ) + ) + total_prompt_tokens += _parsed_response["inputTextTokenCount"] + + usage = Usage( + prompt_tokens=total_prompt_tokens, + completion_tokens=0, + total_tokens=total_prompt_tokens, + ) + return EmbeddingResponse(model=model, usage=usage, data=transformed_responses) diff --git a/litellm/llms/bedrock/embed/amazon_titan_v2_transformation.py b/litellm/llms/bedrock/embed/amazon_titan_v2_transformation.py index a9c980dbb..8244a9a33 100644 --- a/litellm/llms/bedrock/embed/amazon_titan_v2_transformation.py +++ b/litellm/llms/bedrock/embed/amazon_titan_v2_transformation.py @@ -56,6 +56,17 @@ class AmazonTitanV2Config: and v is not None } + def get_supported_openai_params(self) -> List[str]: + return ["dimensions"] + + def map_openai_params( + self, non_default_params: dict, optional_params: dict + ) -> dict: + for k, v in non_default_params.items(): + if k == "dimensions": + optional_params["dimensions"] = v + return optional_params + def _transform_request( self, input: str, inference_params: dict ) -> AmazonTitanV2EmbeddingRequest: diff --git a/litellm/llms/bedrock/embed/cohere_transformation.py b/litellm/llms/bedrock/embed/cohere_transformation.py index 2d5fbe8c2..7a1ab75fd 100644 --- a/litellm/llms/bedrock/embed/cohere_transformation.py +++ b/litellm/llms/bedrock/embed/cohere_transformation.py @@ -11,15 +11,30 @@ from litellm.types.llms.bedrock import CohereEmbeddingRequest, CohereEmbeddingRe from litellm.types.utils import Embedding, EmbeddingResponse -def _transform_request( - input: List[str], inference_params: dict -) -> CohereEmbeddingRequest: - transformed_request = CohereEmbeddingRequest( - texts=input, - input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE, # type: ignore - ) +class BedrockCohereEmbeddingConfig: + def __init__(self) -> None: + pass - for k, v in inference_params.items(): - transformed_request[k] = v # type: ignore + def get_supported_openai_params(self) -> List[str]: + return 
["encoding_format"] - return transformed_request + def map_openai_params( + self, non_default_params: dict, optional_params: dict + ) -> dict: + for k, v in non_default_params.items(): + if k == "encoding_format": + optional_params["embedding_types"] = v + return optional_params + + def _transform_request( + self, input: List[str], inference_params: dict + ) -> CohereEmbeddingRequest: + transformed_request = CohereEmbeddingRequest( + texts=input, + input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE, # type: ignore + ) + + for k, v in inference_params.items(): + transformed_request[k] = v # type: ignore + + return transformed_request diff --git a/litellm/llms/bedrock/embed/embedding.py b/litellm/llms/bedrock/embed/embedding.py index 6ad463cd0..6585ec4f2 100644 --- a/litellm/llms/bedrock/embed/embedding.py +++ b/litellm/llms/bedrock/embed/embedding.py @@ -16,6 +16,7 @@ from litellm.llms.cohere.embed import embedding as cohere_embedding from litellm.llms.custom_httpx.http_handler import ( AsyncHTTPHandler, HTTPHandler, + _get_async_httpx_client, _get_httpx_client, ) from litellm.types.llms.bedrock import AmazonEmbeddingRequest, CohereEmbeddingRequest @@ -25,13 +26,10 @@ from ...base_aws_llm import BaseAWSLLM from ..common_utils import BedrockError, get_runtime_endpoint from .amazon_titan_g1_transformation import AmazonTitanG1Config from .amazon_titan_multimodal_transformation import ( - _transform_request as amazon_multimodal_transform_request, -) -from .amazon_titan_multimodal_transformation import ( - _transform_response as amazon_multimodal_transform_response, + AmazonTitanMultimodalEmbeddingG1Config, ) from .amazon_titan_v2_transformation import AmazonTitanV2Config -from .cohere_transformation import _transform_request as cohere_transform_request +from .cohere_transformation import BedrockCohereEmbeddingConfig class BedrockEmbedding(BaseAWSLLM): @@ -118,6 +116,35 @@ class BedrockEmbedding(BaseAWSLLM): return response.json() + async def _make_async_call( + self, + client: Optional[AsyncHTTPHandler], + timeout: Optional[Union[float, httpx.Timeout]], + api_base: str, + headers: dict, + data: dict, + ) -> dict: + if client is None or not isinstance(client, AsyncHTTPHandler): + _params = {} + if timeout is not None: + if isinstance(timeout, float) or isinstance(timeout, int): + timeout = httpx.Timeout(timeout) + _params["timeout"] = timeout + client = _get_async_httpx_client(_params) # type: ignore + else: + client = client + + try: + response = await client.post(url=api_base, headers=headers, data=json.dumps(data)) # type: ignore + response.raise_for_status() + except httpx.HTTPStatusError as err: + error_code = err.response.status_code + raise BedrockError(status_code=error_code, message=response.text) + except httpx.TimeoutException: + raise BedrockError(status_code=408, message="Timeout error occurred.") + + return response.json() + def _single_func_embeddings( self, client: Optional[HTTPHandler], @@ -186,9 +213,102 @@ class BedrockEmbedding(BaseAWSLLM): ## TRANSFORM RESPONSE ## if model == "amazon.titan-embed-image-v1": - returned_response = amazon_multimodal_transform_response( + returned_response = ( + AmazonTitanMultimodalEmbeddingG1Config()._transform_response( + response_list=responses, model=model + ) + ) + elif model == "amazon.titan-embed-text-v1": + returned_response = AmazonTitanG1Config()._transform_response( response_list=responses, model=model ) + elif model == "amazon.titan-embed-text-v2:0": + returned_response = AmazonTitanV2Config()._transform_response( + 
response_list=responses, model=model + ) + + if returned_response is None: + raise Exception( + "Unable to map model response to known provider format. model={}".format( + model + ) + ) + + return returned_response + + async def _async_single_func_embeddings( + self, + client: Optional[AsyncHTTPHandler], + timeout: Optional[Union[float, httpx.Timeout]], + batch_data: List[dict], + credentials: Any, + extra_headers: Optional[dict], + endpoint_url: str, + aws_region_name: str, + model: str, + logging_obj: Any, + ): + try: + import boto3 + from botocore.auth import SigV4Auth + from botocore.awsrequest import AWSRequest + from botocore.credentials import Credentials + except ImportError: + raise ImportError("Missing boto3 to call bedrock. Run 'pip install boto3'.") + + responses: List[dict] = [] + for data in batch_data: + sigv4 = SigV4Auth(credentials, "bedrock", aws_region_name) + headers = {"Content-Type": "application/json"} + if extra_headers is not None: + headers = {"Content-Type": "application/json", **extra_headers} + request = AWSRequest( + method="POST", url=endpoint_url, data=json.dumps(data), headers=headers + ) + sigv4.add_auth(request) + if ( + extra_headers is not None and "Authorization" in extra_headers + ): # prevent sigv4 from overwriting the auth header + request.headers["Authorization"] = extra_headers["Authorization"] + prepped = request.prepare() + + ## LOGGING + logging_obj.pre_call( + input=data, + api_key="", + additional_args={ + "complete_input_dict": data, + "api_base": prepped.url, + "headers": prepped.headers, + }, + ) + response = await self._make_async_call( + client=client, + timeout=timeout, + api_base=prepped.url, + headers=prepped.headers, + data=data, + ) + + ## LOGGING + logging_obj.post_call( + input=data, + api_key="", + original_response=response, + additional_args={"complete_input_dict": data}, + ) + + responses.append(response) + + returned_response: Optional[EmbeddingResponse] = None + + ## TRANSFORM RESPONSE ## + if model == "amazon.titan-embed-image-v1": + returned_response = ( + AmazonTitanMultimodalEmbeddingG1Config()._transform_response( + response_list=responses, model=model + ) + ) elif model == "amazon.titan-embed-text-v1": returned_response = AmazonTitanG1Config()._transform_response( response_list=responses, model=model @@ -246,7 +366,7 @@ class BedrockEmbedding(BaseAWSLLM): data: Optional[CohereEmbeddingRequest] = None batch_data: Optional[List] = None if provider == "cohere": - data = cohere_transform_request( + data = BedrockCohereEmbeddingConfig()._transform_request( input=input, inference_params=inference_params ) elif provider == "amazon" and model in [ @@ -257,10 +377,10 @@ class BedrockEmbedding(BaseAWSLLM): batch_data = [] for i in input: if model == "amazon.titan-embed-image-v1": - transformed_request: AmazonEmbeddingRequest = ( - amazon_multimodal_transform_request( - input=i, inference_params=inference_params - ) + transformed_request: ( + AmazonEmbeddingRequest + ) = AmazonTitanMultimodalEmbeddingG1Config()._transform_request( + input=i, inference_params=inference_params ) elif model == "amazon.titan-embed-text-v1": transformed_request = AmazonTitanG1Config()._transform_request( @@ -283,6 +403,22 @@ class BedrockEmbedding(BaseAWSLLM): endpoint_url = f"{endpoint_url}/model/{modelId}/invoke" if batch_data is not None: + if aembedding: + return self._async_single_func_embeddings( # type: ignore + client=( + client + if client is not None and isinstance(client, AsyncHTTPHandler) + else None + ), + timeout=timeout, + 
batch_data=batch_data, + credentials=credentials, + extra_headers=extra_headers, + endpoint_url=endpoint_url, + aws_region_name=aws_region_name, + model=model, + logging_obj=logging_obj, + ) return self._single_func_embeddings( client=( client diff --git a/litellm/llms/databricks.py b/litellm/llms/databricks.py index 8cebaf775..3cc1c2456 100644 --- a/litellm/llms/databricks.py +++ b/litellm/llms/databricks.py @@ -703,8 +703,16 @@ class ModelResponseIterator: is_finished = True finish_reason = processed_chunk.choices[0].finish_reason - if hasattr(processed_chunk, "usage"): - usage = processed_chunk.usage # type: ignore + if hasattr(processed_chunk, "usage") and isinstance( + processed_chunk.usage, litellm.Usage + ): + usage_chunk: litellm.Usage = processed_chunk.usage + + usage = ChatCompletionUsageBlock( + prompt_tokens=usage_chunk.prompt_tokens, + completion_tokens=usage_chunk.completion_tokens, + total_tokens=usage_chunk.total_tokens, + ) return GenericStreamingChunk( text=text, diff --git a/litellm/main.py b/litellm/main.py index 70cd40f31..b5d73e439 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -5431,6 +5431,9 @@ def stream_chunk_builder( # # Update usage information if needed prompt_tokens = 0 completion_tokens = 0 + ## anthropic prompt caching information ## + cache_creation_input_tokens: Optional[int] = None + cache_read_input_tokens: Optional[int] = None for chunk in chunks: usage_chunk: Optional[Usage] = None if "usage" in chunk: @@ -5442,6 +5445,13 @@ def stream_chunk_builder( prompt_tokens = usage_chunk.get("prompt_tokens", 0) or 0 if "completion_tokens" in usage_chunk: completion_tokens = usage_chunk.get("completion_tokens", 0) or 0 + if "cache_creation_input_tokens" in usage_chunk: + cache_creation_input_tokens = usage_chunk.get( + "cache_creation_input_tokens" + ) + if "cache_read_input_tokens" in usage_chunk: + cache_read_input_tokens = usage_chunk.get("cache_read_input_tokens") + try: response["usage"]["prompt_tokens"] = prompt_tokens or token_counter( model=model, messages=messages @@ -5460,6 +5470,13 @@ def stream_chunk_builder( response["usage"]["prompt_tokens"] + response["usage"]["completion_tokens"] ) + if cache_creation_input_tokens is not None: + response["usage"][ + "cache_creation_input_tokens" + ] = cache_creation_input_tokens + if cache_read_input_tokens is not None: + response["usage"]["cache_read_input_tokens"] = cache_read_input_tokens + return convert_to_model_response_object( response_object=response, model_response_object=model_response, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index 220da4932..2c3fcfa1b 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -2,3 +2,16 @@ model_list: - model_name: "gpt-3.5-turbo" litellm_params: model: "gpt-3.5-turbo" + +litellm_settings: + max_internal_user_budget: 0.02 # amount in USD + internal_user_budget_duration: "1s" # reset every second + +general_settings: + master_key: sk-1234 + alerting: ["slack"] + alerting_threshold: 0.0001 # (Seconds) set an artifically low threshold for testing alerting + alert_to_webhook_url: { + "spend_reports": ["https://webhook.site/7843a980-a494-4967-80fb-d502dbc16886", "https://webhook.site/28cfb179-f4fb-4408-8129-729ff55cf213"] + } + diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index 11d99b5ea..39f65ac2d 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -1632,6 +1632,16 @@ class AllCallbacks(LiteLLMBase): ui_callback_name="Langsmith", ) + lago: 
CallbackOnUI = CallbackOnUI( + litellm_callback_name="lago", + litellm_callback_params=[ + "LAGO_API_BASE", + "LAGO_API_KEY", + "LAGO_API_EVENT_CODE", + ], + ui_callback_name="Lago Billing", + ) + class SpendLogsMetadata(TypedDict): """ diff --git a/litellm/proxy/management_endpoints/internal_user_endpoints.py b/litellm/proxy/management_endpoints/internal_user_endpoints.py index 425dbe82c..76ffc9089 100644 --- a/litellm/proxy/management_endpoints/internal_user_endpoints.py +++ b/litellm/proxy/management_endpoints/internal_user_endpoints.py @@ -505,6 +505,10 @@ async def user_update( ): # models default to [], spend defaults to 0, we should not reset these values non_default_values[k] = v + is_internal_user = False + if data.user_role == LitellmUserRoles.INTERNAL_USER: + is_internal_user = True + if "budget_duration" in non_default_values: duration_s = _duration_in_seconds( duration=non_default_values["budget_duration"] @@ -512,6 +516,20 @@ async def user_update( user_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s) non_default_values["budget_reset_at"] = user_reset_at + if "max_budget" not in non_default_values: + if ( + is_internal_user and litellm.max_internal_user_budget is not None + ): # applies internal user limits, if user role updated + non_default_values["max_budget"] = litellm.max_internal_user_budget + + if ( + "budget_duration" not in non_default_values + ): # applies internal user limits, if user role updated + if is_internal_user and litellm.internal_user_budget_duration is not None: + non_default_values["budget_duration"] = ( + litellm.internal_user_budget_duration + ) + ## ADD USER, IF NEW ## verbose_proxy_logger.debug("/user/update: Received data = %s", data) if data.user_id is not None and len(data.user_id) > 0: diff --git a/litellm/tests/test_anthropic_prompt_caching.py b/litellm/tests/test_anthropic_prompt_caching.py index 4b7a4a3de..feb31e193 100644 --- a/litellm/tests/test_anthropic_prompt_caching.py +++ b/litellm/tests/test_anthropic_prompt_caching.py @@ -282,6 +282,82 @@ async def test_anthropic_api_prompt_caching_no_headers(): ) +@pytest.mark.asyncio() +async def test_anthropic_api_prompt_caching_streaming(): + from litellm.tests.test_streaming import streaming_format_tests + + response = await litellm.acompletion( + model="anthropic/claude-3-5-sonnet-20240620", + messages=[ + # System Message + { + "role": "system", + "content": [ + { + "type": "text", + "text": "Here is the full text of a complex legal agreement" + * 400, + "cache_control": {"type": "ephemeral"}, + } + ], + }, + # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache. + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + } + ], + }, + { + "role": "assistant", + "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo", + }, + # The final turn is marked with cache-control, for continuing in followups. 
+ { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + "cache_control": {"type": "ephemeral"}, + } + ], + }, + ], + temperature=0.2, + max_tokens=10, + stream=True, + stream_options={"include_usage": True}, + ) + + idx = 0 + is_cache_read_input_tokens_in_usage = False + is_cache_creation_input_tokens_in_usage = False + async for chunk in response: + streaming_format_tests(idx=idx, chunk=chunk) + # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl + if hasattr(chunk, "usage"): + print("Received final usage - {}".format(chunk.usage)) + if hasattr(chunk, "usage") and hasattr(chunk.usage, "cache_read_input_tokens"): + is_cache_read_input_tokens_in_usage = True + if hasattr(chunk, "usage") and hasattr( + chunk.usage, "cache_creation_input_tokens" + ): + is_cache_creation_input_tokens_in_usage = True + + idx += 1 + + print("response=", response) + + assert ( + is_cache_read_input_tokens_in_usage and is_cache_creation_input_tokens_in_usage + ) + + @pytest.mark.asyncio async def test_litellm_anthropic_prompt_caching_system(): # https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#prompt-caching-examples diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 8fdf722f0..5dd7681ce 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -2172,7 +2172,14 @@ def test_completion_openai(): pytest.fail(f"Error occurred: {e}") -@pytest.mark.parametrize("model", ["gpt-4o-2024-08-06", "azure/chatgpt-v-2"]) +@pytest.mark.parametrize( + "model", + [ + "gpt-4o-2024-08-06", + "azure/chatgpt-v-2", + "bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + ], +) def test_completion_openai_pydantic(model): try: litellm.set_verbose = True @@ -2201,7 +2208,7 @@ def test_completion_openai_pydantic(model): ) break except litellm.JSONSchemaValidationError: - print("ERROR OCCURRED! INVALID JSON") + pytest.fail("ERROR OCCURRED! 
INVALID JSON") print("This is the response object\n", response) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 4067ef047..4af2cf990 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -319,9 +319,52 @@ async def test_cohere_embedding3(custom_llm_provider): "bedrock/amazon.titan-embed-text-v2:0", ], ) -@pytest.mark.parametrize("sync_mode", [True]) +@pytest.mark.parametrize("sync_mode", [True, False]) # , @pytest.mark.asyncio async def test_bedrock_embedding_titan(model, sync_mode): + try: + # this tests if we support str input for bedrock embedding + litellm.set_verbose = True + litellm.enable_cache() + import time + + current_time = str(time.time()) + # DO NOT MAKE THE INPUT A LIST in this test + if sync_mode: + response = embedding( + model=model, + input=f"good morning from litellm, attempting to embed data {current_time}", # input should always be a string in this test + aws_region_name="us-west-2", + ) + else: + response = await litellm.aembedding( + model=model, + input=f"good morning from litellm, attempting to embed data {current_time}", # input should always be a string in this test + aws_region_name="us-west-2", + ) + print("response:", response) + assert isinstance( + response["data"][0]["embedding"], list + ), "Expected response to be a list" + print("type of first embedding:", type(response["data"][0]["embedding"][0])) + assert all( + isinstance(x, float) for x in response["data"][0]["embedding"] + ), "Expected response to be a list of floats" + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + +@pytest.mark.parametrize( + "model", + [ + "bedrock/amazon.titan-embed-text-v1", + "bedrock/amazon.titan-embed-image-v1", + "bedrock/amazon.titan-embed-text-v2:0", + ], +) +@pytest.mark.parametrize("sync_mode", [True]) # True, +@pytest.mark.asyncio +async def test_bedrock_embedding_titan_caching(model, sync_mode): try: # this tests if we support str input for bedrock embedding litellm.set_verbose = True diff --git a/litellm/tests/test_optional_params.py b/litellm/tests/test_optional_params.py index 54e2e5b43..3e7d1e5e5 100644 --- a/litellm/tests/test_optional_params.py +++ b/litellm/tests/test_optional_params.py @@ -70,13 +70,43 @@ def test_anthropic_optional_params(stop_sequence, expected_count): def test_bedrock_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( - user="John", encoding_format=None, custom_llm_provider="bedrock" + model="", user="John", encoding_format=None, custom_llm_provider="bedrock" ) assert len(optional_params) == 0 +@pytest.mark.parametrize( + "model, expected_dimensions, dimensions_kwarg", + [ + ("bedrock/amazon.titan-embed-text-v1", False, None), + ("bedrock/amazon.titan-embed-image-v1", True, "embeddingConfig"), + ("bedrock/amazon.titan-embed-text-v2:0", True, "dimensions"), + ("bedrock/cohere.embed-multilingual-v3", False, None), + ], +) +def test_bedrock_optional_params_embeddings_dimension( + model, expected_dimensions, dimensions_kwarg +): + litellm.drop_params = True + optional_params = get_optional_params_embeddings( + model=model, + user="John", + encoding_format=None, + dimensions=20, + custom_llm_provider="bedrock", + ) + if expected_dimensions: + assert len(optional_params) == 1 + else: + assert len(optional_params) == 0 + + if dimensions_kwarg is not None: + assert dimensions_kwarg in optional_params + + def test_google_ai_studio_optional_params_embeddings(): optional_params = get_optional_params_embeddings( 
+ model="", user="John", encoding_format=None, custom_llm_provider="gemini", @@ -88,7 +118,7 @@ def test_google_ai_studio_optional_params_embeddings(): def test_openai_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( - user="John", encoding_format=None, custom_llm_provider="openai" + model="", user="John", encoding_format=None, custom_llm_provider="openai" ) assert len(optional_params) == 1 assert optional_params["user"] == "John" @@ -97,7 +127,10 @@ def test_openai_optional_params_embeddings(): def test_azure_optional_params_embeddings(): litellm.drop_params = True optional_params = get_optional_params_embeddings( - user="John", encoding_format=None, custom_llm_provider="azure" + model="chatgpt-v-2", + user="John", + encoding_format=None, + custom_llm_provider="azure", ) assert len(optional_params) == 1 assert optional_params["user"] == "John" @@ -455,6 +488,7 @@ def test_get_optional_params_image_gen(): def test_bedrock_optional_params_embeddings_provider_specific_params(): optional_params = get_optional_params_embeddings( + model="my-custom-model", custom_llm_provider="huggingface", wait_for_model=True, ) diff --git a/litellm/types/llms/anthropic.py b/litellm/types/llms/anthropic.py index f14aa20c7..7b856a284 100644 --- a/litellm/types/llms/anthropic.py +++ b/litellm/types/llms/anthropic.py @@ -287,3 +287,11 @@ class AnthropicResponse(BaseModel): usage: AnthropicResponseUsageBlock """Billing and rate-limit usage.""" + + +class AnthropicChatCompletionUsageBlock(TypedDict, total=False): + prompt_tokens: Required[int] + completion_tokens: Required[int] + total_tokens: Required[int] + cache_creation_input_tokens: int + cache_read_input_tokens: int diff --git a/litellm/utils.py b/litellm/utils.py index efd48e8ab..8dd18f450 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2550,7 +2550,7 @@ def get_optional_params_image_gen( def get_optional_params_embeddings( # 2 optional params - model=None, + model: str, user=None, encoding_format=None, dimensions=None, @@ -2606,7 +2606,7 @@ def get_optional_params_embeddings( ): raise UnsupportedParamsError( status_code=500, - message=f"Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.", + message="Setting dimensions is not supported for OpenAI `text-embedding-3` and later models. To drop it from the call, set `litellm.drop_params = True`.", ) if custom_llm_provider == "triton": keys = list(non_default_params.keys()) @@ -2641,39 +2641,57 @@ def get_optional_params_embeddings( ) final_params = {**optional_params, **kwargs} return final_params - if custom_llm_provider == "vertex_ai": - if len(non_default_params.keys()) > 0: - if litellm.drop_params is True: # drop the unsupported non-default values - keys = list(non_default_params.keys()) - for k in keys: - non_default_params.pop(k, None) - final_params = {**non_default_params, **kwargs} - return final_params - raise UnsupportedParamsError( - status_code=500, - message=f"Setting user/encoding format is not supported by {custom_llm_provider}. 
To drop it from the call, set `litellm.drop_params = True`.", - ) if custom_llm_provider == "bedrock": # if dimensions is in non_default_params -> pass it for model=bedrock/amazon.titan-embed-text-v2 - if ( - "dimensions" in non_default_params.keys() - and "amazon.titan-embed-text-v2" in model - ): - kwargs["dimensions"] = non_default_params["dimensions"] - non_default_params.pop("dimensions", None) + if "amazon.titan-embed-text-v1" in model: + object: Any = litellm.AmazonTitanG1Config() + elif "amazon.titan-embed-image-v1" in model: + object = litellm.AmazonTitanMultimodalEmbeddingG1Config() + elif "amazon.titan-embed-text-v2:0" in model: + object = litellm.AmazonTitanV2Config() + elif "cohere.embed-multilingual-v3" in model: + object = litellm.BedrockCohereEmbeddingConfig() + else: # unmapped model + supported_params = [] + _check_valid_arg(supported_params=supported_params) + final_params = {**kwargs} + return final_params - if len(non_default_params.keys()) > 0: - if litellm.drop_params is True: # drop the unsupported non-default values - keys = list(non_default_params.keys()) - for k in keys: - non_default_params.pop(k, None) - final_params = {**non_default_params, **kwargs} - return final_params - raise UnsupportedParamsError( - status_code=500, - message=f"Setting user/encoding format is not supported by {custom_llm_provider}. To drop it from the call, set `litellm.drop_params = True`.", - ) - return {**non_default_params, **kwargs} + supported_params = object.get_supported_openai_params() + _check_valid_arg(supported_params=supported_params) + optional_params = object.map_openai_params( + non_default_params=non_default_params, optional_params={} + ) + final_params = {**optional_params, **kwargs} + return final_params + # elif model == "amazon.titan-embed-image-v1": + # supported_params = litellm.AmazonTitanG1Config().get_supported_openai_params() + # _check_valid_arg(supported_params=supported_params) + # optional_params = litellm.AmazonTitanG1Config().map_openai_params( + # non_default_params=non_default_params, optional_params={} + # ) + # final_params = {**optional_params, **kwargs} + # return final_params + + # if ( + # "dimensions" in non_default_params.keys() + # and "amazon.titan-embed-text-v2" in model + # ): + # kwargs["dimensions"] = non_default_params["dimensions"] + # non_default_params.pop("dimensions", None) + + # if len(non_default_params.keys()) > 0: + # if litellm.drop_params is True: # drop the unsupported non-default values + # keys = list(non_default_params.keys()) + # for k in keys: + # non_default_params.pop(k, None) + # final_params = {**non_default_params, **kwargs} + # return final_params + # raise UnsupportedParamsError( + # status_code=500, + # message=f"Setting user/encoding format is not supported by {custom_llm_provider}. 
To drop it from the call, set `litellm.drop_params = True`.", + # ) + # return {**non_default_params, **kwargs} if custom_llm_provider == "mistral": supported_params = get_supported_openai_params( model=model, @@ -9888,11 +9906,7 @@ class CustomStreamWrapper: if anthropic_response_obj["usage"] is not None: model_response.usage = litellm.Usage( - prompt_tokens=anthropic_response_obj["usage"]["prompt_tokens"], - completion_tokens=anthropic_response_obj["usage"][ - "completion_tokens" - ], - total_tokens=anthropic_response_obj["usage"]["total_tokens"], + **anthropic_response_obj["usage"] ) if ( @@ -10507,10 +10521,10 @@ class CustomStreamWrapper: original_chunk.system_fingerprint ) print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") - if self.sent_first_chunk == False: + if self.sent_first_chunk is False: model_response.choices[0].delta["role"] = "assistant" self.sent_first_chunk = True - elif self.sent_first_chunk == True and hasattr( + elif self.sent_first_chunk is True and hasattr( model_response.choices[0].delta, "role" ): _initial_delta = model_response.choices[ @@ -10575,7 +10589,7 @@ class CustomStreamWrapper: model_response.choices[0].delta.tool_calls is not None or model_response.choices[0].delta.function_call is not None ): - if self.sent_first_chunk == False: + if self.sent_first_chunk is False: model_response.choices[0].delta["role"] = "assistant" self.sent_first_chunk = True return model_response
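For reference, a minimal sketch (not part of this diff) of how the new `response_format` handling added to `AmazonConverseConfig` might be exercised end to end. The model ID is taken from the updated `test_completion_openai_pydantic` parametrization, and the `json_schema` payload follows the shape the config parses; treat the exact call and output access as an illustrative assumption, not a verified part of the change set.

```python
import litellm

# Hypothetical usage sketch: the schema is translated into a single forced tool call,
# and with json_mode enabled the tool call's arguments are surfaced as message content.
response = litellm.completion(
    model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0",
    messages=[
        {"role": "user", "content": "Return the city and country for Paris as JSON."}
    ],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "location",
            "schema": {
                "type": "object",
                "properties": {
                    "city": {"type": "string"},
                    "country": {"type": "string"},
                },
                "required": ["city", "country"],
            },
        },
    },
)

# Expect a JSON string matching the schema, since the forced tool call's
# arguments are returned as the message content in json_mode.
print(response.choices[0].message.content)
```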