From 2e5c46ef6dfe4821920c6983884d5cf3bb08f5f5 Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Fri, 4 Oct 2024 21:28:53 -0400 Subject: [PATCH] LiteLLM Minor Fixes & Improvements (10/04/2024) (#6064) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(litellm_logging.py): ensure cache hits are scrubbed if 'turn_off_message_logging' is enabled * fix(sagemaker.py): fix streaming to raise error immediately Fixes https://github.com/BerriAI/litellm/issues/6054 * (fixes) gcs bucket key based logging (#6044) * fixes for gcs bucket logging * fix StandardCallbackDynamicParams * fix - gcs logging when payload is not serializable * add test_add_callback_via_key_litellm_pre_call_utils_gcs_bucket * working success callbacks * linting fixes * fix linting error * add type hints to functions * fixes for dynamic success and failure logging * fix for test_async_chat_openai_stream * fix handle case when key based logging vars are set as os.environ/ vars * fix prometheus track cooldown events on custom logger (#6060) * (docs) add 1k rps load test doc (#6059) * docs 1k rps load test * docs load testing * docs load testing litellm * docs load testing * clean up load test doc * docs prom metrics for load testing * docs using prometheus on load testing * doc load testing with prometheus * (fixes) docs + qa - gcs key based logging (#6061) * fixes for required values for gcs bucket * docs gcs bucket logging * bump: version 1.48.12 → 1.48.13 * ci/cd run again * bump: version 1.48.13 → 1.48.14 * update load test doc * (docs) router settings - on litellm config (#6037) * add yaml with all router settings * add docs for router settings * docs router settings litellm settings * (feat) OpenAI prompt caching models to model cost map (#6063) * add prompt caching for latest models * add cache_read_input_token_cost for prompt caching models * fix(litellm_logging.py): check if param is iterable Fixes https://github.com/BerriAI/litellm/issues/6025#issuecomment-2393929946 * fix(factory.py): support passing an 'assistant_continue_message' to prevent bedrock error Fixes https://github.com/BerriAI/litellm/issues/6053 * fix(databricks/chat): handle streaming responses * fix(factory.py): fix linting error * fix(utils.py): unify anthropic + deepseek prompt caching information to openai format Fixes https://github.com/BerriAI/litellm/issues/6069 * test: fix test * fix(types/utils.py): support all openai roles Fixes https://github.com/BerriAI/litellm/issues/6052 * test: fix test --------- Co-authored-by: Ishaan Jaff --- .../docs/completion/prompt_caching.md | 199 ++++++++++++ docs/my-website/docs/completion/usage.md | 124 -------- docs/my-website/sidebars.js | 1 + .../exception_mapping_utils.py | 100 +++++- litellm/litellm_core_utils/litellm_logging.py | 41 ++- litellm/litellm_core_utils/redact_messages.py | 33 +- litellm/llms/anthropic/chat/handler.py | 19 +- litellm/llms/databricks/chat.py | 68 +++-- litellm/llms/databricks/streaming_utils.py | 10 +- litellm/llms/prompt_templates/factory.py | 135 ++++++--- litellm/llms/sagemaker/sagemaker.py | 15 +- litellm/proxy/_new_secret_config.yaml | 79 ++++- litellm/types/utils.py | 28 +- .../local_testing/test_bedrock_completion.py | 285 ++++++++++++++++++ tests/local_testing/test_completion_cost.py | 4 +- .../test_custom_callback_input.py | 17 +- tests/local_testing/test_prompt_caching.py | 81 +++++ tests/local_testing/test_sagemaker.py | 34 +++ tests/local_testing/test_utils.py | 20 +- 19 files changed, 1034 insertions(+), 259 deletions(-) create 
mode 100644 docs/my-website/docs/completion/prompt_caching.md create mode 100644 tests/local_testing/test_prompt_caching.py diff --git a/docs/my-website/docs/completion/prompt_caching.md b/docs/my-website/docs/completion/prompt_caching.md new file mode 100644 index 000000000..3a5537b52 --- /dev/null +++ b/docs/my-website/docs/completion/prompt_caching.md @@ -0,0 +1,199 @@ +# Prompt Caching + +For OpenAI + Anthropic + Deepseek, LiteLLM follows the OpenAI prompt caching usage object format: + +```bash +"usage": { + "prompt_tokens": 2006, + "completion_tokens": 300, + "total_tokens": 2306, + "prompt_tokens_details": { + "cached_tokens": 1920 + }, + "completion_tokens_details": { + "reasoning_tokens": 0 + } + # ANTHROPIC_ONLY # + "cache_creation_input_tokens": 0 +} +``` + +- `prompt_tokens`: These are the non-cached prompt tokens (same as Anthropic, equivalent to Deepseek `prompt_cache_miss_tokens`). +- `completion_tokens`: These are the output tokens generated by the model. +- `total_tokens`: Sum of prompt_tokens + completion_tokens. +- `prompt_tokens_details`: Object containing cached_tokens. + - `cached_tokens`: Tokens that were a cache-hit for that call. +- `completion_tokens_details`: Object containing reasoning_tokens. +- **ANTHROPIC_ONLY**: `cache_creation_input_tokens` are the number of tokens that were written to cache. (Anthropic charges for this). + +## Quick Start + +Note: OpenAI caching is only available for prompts containing 1024 tokens or more. + +```python +from litellm import completion +import os + +os.environ["OPENAI_API_KEY"] = "" + +for _ in range(2): + response = completion( + model="gpt-4o", + messages=[ + # System Message + { + "role": "system", + "content": [ + { + "type": "text", + "text": "Here is the full text of a complex legal agreement" + * 400, + } + ], + }, + # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache. + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + } + ], + }, + { + "role": "assistant", + "content": "Certainly! The key terms and conditions are the following: the contract is 1 year long for $10/mo", + }, + # The final turn is marked with cache-control, for continuing in followups. + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What are the key terms and conditions in this agreement?", + } + ], + }, + ], + temperature=0.2, + max_tokens=10, + ) + +print("response=", response) +print("response.usage=", response.usage) + +assert "prompt_tokens_details" in response.usage +assert response.usage.prompt_tokens_details.cached_tokens > 0 +``` + +### Anthropic Example + +Anthropic charges for cache writes. + +Specify the content to cache with `"cache_control": {"type": "ephemeral"}`. + +If you pass that in for any other LLM provider, it will be ignored.
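Because the usage block is normalized across providers, downstream code can read the cache-related fields the same way regardless of whether the response came from OpenAI, Anthropic, or Deepseek. The snippet below is a minimal sketch of that read path; the model name and message are placeholders, not taken from this doc, and the `getattr` defaults are defensive assumptions for providers that omit a field:

```python
from litellm import completion

# Placeholder model/prompt - any prompt-caching-capable model works here.
response = completion(
    model="gpt-4o",
    messages=[{"role": "user", "content": "hello"}],
)

usage = response.usage
details = getattr(usage, "prompt_tokens_details", None)  # may be absent/None on some responses
print("cached tokens:", (details.cached_tokens or 0) if details else 0)

# Anthropic-only field: tokens written to the cache on this call (Anthropic bills for these).
print("cache writes:", getattr(usage, "cache_creation_input_tokens", 0))
```

The full Anthropic request, with `cache_control` set on the system content, looks like this: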
+ +```python +from litellm import completion +import litellm +import os + +litellm.set_verbose = True # 👈 SEE RAW REQUEST +os.environ["ANTHROPIC_API_KEY"] = "" + +response = completion( + model="anthropic/claude-3-5-sonnet-20240620", + messages=[ + { + "role": "system", + "content": [ + { + "type": "text", + "text": "You are an AI assistant tasked with analyzing legal documents.", + }, + { + "type": "text", + "text": "Here is the full text of a complex legal agreement" * 400, + "cache_control": {"type": "ephemeral"}, + }, + ], + }, + { + "role": "user", + "content": "what are the key terms and conditions in this agreement?", + }, + ] +) + +print(response.usage) +``` + +### Deepseek Example + +Works the same as OpenAI. + +```python +from litellm import completion +import litellm +import os + +os.environ["DEEPSEEK_API_KEY"] = "" + +litellm.set_verbose = True # 👈 SEE RAW REQUEST + +model_name = "deepseek/deepseek-chat" +messages_1 = [ + { + "role": "system", + "content": "You are a history expert. The user will provide a series of questions, and your answers should be concise and start with `Answer:`", + }, + { + "role": "user", + "content": "In what year did Qin Shi Huang unify the six states?", + }, + {"role": "assistant", "content": "Answer: 221 BC"}, + {"role": "user", "content": "Who was the founder of the Han Dynasty?"}, + {"role": "assistant", "content": "Answer: Liu Bang"}, + {"role": "user", "content": "Who was the last emperor of the Tang Dynasty?"}, + {"role": "assistant", "content": "Answer: Li Zhu"}, + { + "role": "user", + "content": "Who was the founding emperor of the Ming Dynasty?", + }, + {"role": "assistant", "content": "Answer: Zhu Yuanzhang"}, + { + "role": "user", + "content": "Who was the founding emperor of the Qing Dynasty?", + }, +] + +message_2 = [ + { + "role": "system", + "content": "You are a history expert. The user will provide a series of questions, and your answers should be concise and start with `Answer:`", + }, + { + "role": "user", + "content": "In what year did Qin Shi Huang unify the six states?", + }, + {"role": "assistant", "content": "Answer: 221 BC"}, + {"role": "user", "content": "Who was the founder of the Han Dynasty?"}, + {"role": "assistant", "content": "Answer: Liu Bang"}, + {"role": "user", "content": "Who was the last emperor of the Tang Dynasty?"}, + {"role": "assistant", "content": "Answer: Li Zhu"}, + { + "role": "user", + "content": "Who was the founding emperor of the Ming Dynasty?", + }, + {"role": "assistant", "content": "Answer: Zhu Yuanzhang"}, + {"role": "user", "content": "When did the Shang Dynasty fall?"}, +] + +response_1 = litellm.completion(model=model_name, messages=messages_1) +response_2 = litellm.completion(model=model_name, messages=message_2) + +# Add any assertions here to check the response +print(response_2.usage) +``` \ No newline at end of file diff --git a/docs/my-website/docs/completion/usage.md b/docs/my-website/docs/completion/usage.md index d9f5aeda4..2a9eab941 100644 --- a/docs/my-website/docs/completion/usage.md +++ b/docs/my-website/docs/completion/usage.md @@ -49,127 +49,3 @@ for chunk in completion: print(chunk.choices[0].delta) ``` - -## Prompt Caching - -For Anthropic + Deepseek, LiteLLM follows the Anthropic prompt caching usage object format: - -```bash -"usage": { - "prompt_tokens": int, - "completion_tokens": int, - "total_tokens": int, - "_cache_creation_input_tokens": int, # hidden param for prompt caching. Might change, once openai introduces their equivalent.
- "_cache_read_input_tokens": int # hidden param for prompt caching. Might change, once openai introduces their equivalent. -} -``` - -- `prompt_tokens`: These are the non-cached prompt tokens (same as Anthropic, equivalent to Deepseek `prompt_cache_miss_tokens`). -- `completion_tokens`: These are the output tokens generated by the model. -- `total_tokens`: Sum of prompt_tokens + completion_tokens. -- `_cache_creation_input_tokens`: Input tokens that were written to cache. (Anthropic only). -- `_cache_read_input_tokens`: Input tokens that were read from cache for that call. (equivalent to Deepseek `prompt_cache_hit_tokens`). - - -### Anthropic Example - -```python -from litellm import completion -import litellm -import os - -litellm.set_verbose = True # 👈 SEE RAW REQUEST -os.environ["ANTHROPIC_API_KEY"] = "" - -response = completion( - model="anthropic/claude-3-5-sonnet-20240620", - messages=[ - { - "role": "system", - "content": [ - { - "type": "text", - "text": "You are an AI assistant tasked with analyzing legal documents.", - }, - { - "type": "text", - "text": "Here is the full text of a complex legal agreement" * 400, - "cache_control": {"type": "ephemeral"}, - }, - ], - }, - { - "role": "user", - "content": "what are the key terms and conditions in this agreement?", - }, - ] -) - -print(response.usage) -``` - -### Deepeek Example - -```python -from litellm import completion -import litellm -import os - -os.environ["DEEPSEEK_API_KEY"] = "" - -litellm.set_verbose = True # 👈 SEE RAW REQUEST - -model_name = "deepseek/deepseek-chat" -messages_1 = [ - { - "role": "system", - "content": "You are a history expert. The user will provide a series of questions, and your answers should be concise and start with `Answer:`", - }, - { - "role": "user", - "content": "In what year did Qin Shi Huang unify the six states?", - }, - {"role": "assistant", "content": "Answer: 221 BC"}, - {"role": "user", "content": "Who was the founder of the Han Dynasty?"}, - {"role": "assistant", "content": "Answer: Liu Bang"}, - {"role": "user", "content": "Who was the last emperor of the Tang Dynasty?"}, - {"role": "assistant", "content": "Answer: Li Zhu"}, - { - "role": "user", - "content": "Who was the founding emperor of the Ming Dynasty?", - }, - {"role": "assistant", "content": "Answer: Zhu Yuanzhang"}, - { - "role": "user", - "content": "Who was the founding emperor of the Qing Dynasty?", - }, -] - -message_2 = [ - { - "role": "system", - "content": "You are a history expert. 
The user will provide a series of questions, and your answers should be concise and start with `Answer:`", - }, - { - "role": "user", - "content": "In what year did Qin Shi Huang unify the six states?", - }, - {"role": "assistant", "content": "Answer: 221 BC"}, - {"role": "user", "content": "Who was the founder of the Han Dynasty?"}, - {"role": "assistant", "content": "Answer: Liu Bang"}, - {"role": "user", "content": "Who was the last emperor of the Tang Dynasty?"}, - {"role": "assistant", "content": "Answer: Li Zhu"}, - { - "role": "user", - "content": "Who was the founding emperor of the Ming Dynasty?", - }, - {"role": "assistant", "content": "Answer: Zhu Yuanzhang"}, - {"role": "user", "content": "When did the Shang Dynasty fall?"}, -] - -response_1 = litellm.completion(model=model_name, messages=messages_1) -response_2 = litellm.completion(model=model_name, messages=message_2) - -# Add any assertions here to check the response -print(response_2.usage) -``` \ No newline at end of file diff --git a/docs/my-website/sidebars.js b/docs/my-website/sidebars.js index 1fe5d6364..6c58c5002 100644 --- a/docs/my-website/sidebars.js +++ b/docs/my-website/sidebars.js @@ -198,6 +198,7 @@ const sidebars = { "completion/drop_params", "completion/prompt_formatting", "completion/output", + "completion/prompt_caching", "completion/usage", "exception_mapping", "completion/stream", diff --git a/litellm/litellm_core_utils/exception_mapping_utils.py b/litellm/litellm_core_utils/exception_mapping_utils.py index 87596a0b4..61cca6e07 100644 --- a/litellm/litellm_core_utils/exception_mapping_utils.py +++ b/litellm/litellm_core_utils/exception_mapping_utils.py @@ -750,6 +750,17 @@ def exception_type( # type: ignore model=model, llm_provider="bedrock", ) + elif ( + "Conversation blocks and tool result blocks cannot be provided in the same turn." + in error_str + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"BedrockException - {error_str}\n. 
Enable 'litellm.modify_params=True' (for PROXY do: `litellm_settings::modify_params: True`) to insert a dummy assistant message and fix this error.", + model=model, + llm_provider="bedrock", + response=original_exception.response, + ) elif "Malformed input request" in error_str: exception_mapping_worked = True raise BadRequestError( @@ -895,7 +906,10 @@ def exception_type( # type: ignore llm_provider=custom_llm_provider, litellm_debug_info=extra_information, ) - elif custom_llm_provider == "sagemaker": + elif ( + custom_llm_provider == "sagemaker" + or custom_llm_provider == "sagemaker_chat" + ): if "Unable to locate credentials" in error_str: exception_mapping_worked = True raise BadRequestError( @@ -926,6 +940,90 @@ def exception_type( # type: ignore llm_provider="sagemaker", response=original_exception.response, ) + elif hasattr(original_exception, "status_code"): + if original_exception.status_code == 500: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"SagemakerException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + response=httpx.Response( + status_code=500, + request=httpx.Request( + method="POST", url="https://api.openai.com/v1/" + ), + ), + ) + elif original_exception.status_code == 401: + exception_mapping_worked = True + raise AuthenticationError( + message=f"SagemakerException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 400: + exception_mapping_worked = True + raise BadRequestError( + message=f"SagemakerException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 404: + exception_mapping_worked = True + raise NotFoundError( + message=f"SagemakerException - {original_exception.message}", + llm_provider=custom_llm_provider, + model=model, + response=original_exception.response, + ) + elif original_exception.status_code == 408: + exception_mapping_worked = True + raise Timeout( + message=f"SagemakerException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) + elif ( + original_exception.status_code == 422 + or original_exception.status_code == 424 + ): + exception_mapping_worked = True + raise BadRequestError( + message=f"SagemakerException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 429: + exception_mapping_worked = True + raise RateLimitError( + message=f"SagemakerException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 503: + exception_mapping_worked = True + raise ServiceUnavailableError( + message=f"SagemakerException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + response=original_exception.response, + litellm_debug_info=extra_information, + ) + elif original_exception.status_code == 504: # gateway timeout error + exception_mapping_worked = True + raise Timeout( + message=f"SagemakerException - {original_exception.message}", + model=model, + llm_provider=custom_llm_provider, + litellm_debug_info=extra_information, + ) elif ( 
custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai_beta" diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 1915176e1..e040bfb40 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -363,7 +363,7 @@ class Logging: for param in _supported_callback_params: if param in kwargs: _param_value = kwargs.pop(param) - if "os.environ/" in _param_value: + if _param_value is not None and "os.environ/" in _param_value: _param_value = get_secret_str(secret_name=_param_value) standard_callback_dynamic_params[param] = _param_value # type: ignore return standard_callback_dynamic_params @@ -632,7 +632,12 @@ class Logging: ) ) original_response = redact_message_input_output_from_logging( - litellm_logging_obj=self, result=original_response + model_call_details=( + self.model_call_details + if hasattr(self, "model_call_details") + else {} + ), + result=original_response, ) # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made @@ -933,7 +938,12 @@ class Logging: callbacks = litellm.success_callback result = redact_message_input_output_from_logging( - result=result, litellm_logging_obj=self + model_call_details=( + self.model_call_details + if hasattr(self, "model_call_details") + else {} + ), + result=result, ) ## LOGGING HOOK ## @@ -1591,7 +1601,10 @@ class Logging: callbacks = litellm._async_success_callback result = redact_message_input_output_from_logging( - result=result, litellm_logging_obj=self + model_call_details=( + self.model_call_details if hasattr(self, "model_call_details") else {} + ), + result=result, ) ## LOGGING HOOK ## @@ -1899,7 +1912,12 @@ class Logging: result = None # result sent to all loggers, init this to None incase it's not created result = redact_message_input_output_from_logging( - result=result, litellm_logging_obj=self + model_call_details=( + self.model_call_details + if hasattr(self, "model_call_details") + else {} + ), + result=result, ) for callback in callbacks: try: @@ -2747,8 +2765,17 @@ def get_standard_logging_object_payload( else: final_response_obj = None - if litellm.turn_off_message_logging: - final_response_obj = "redacted-by-litellm" + modified_final_response_obj = redact_message_input_output_from_logging( + model_call_details=kwargs, + result=final_response_obj, + ) + + if modified_final_response_obj is not None and isinstance( + modified_final_response_obj, BaseModel + ): + final_response_obj = modified_final_response_obj.model_dump() + else: + final_response_obj = modified_final_response_obj payload: StandardLoggingPayload = StandardLoggingPayload( id=str(id), diff --git a/litellm/litellm_core_utils/redact_messages.py b/litellm/litellm_core_utils/redact_messages.py index 631810aaa..8dad71439 100644 --- a/litellm/litellm_core_utils/redact_messages.py +++ b/litellm/litellm_core_utils/redact_messages.py @@ -8,7 +8,7 @@ # Thank you users! We ❤️ you! 
- Krrish & Ishaan import copy -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Optional import litellm from litellm.integrations.custom_logger import CustomLogger @@ -30,29 +30,27 @@ def redact_message_input_output_from_custom_logger( hasattr(custom_logger, "message_logging") and custom_logger.message_logging is not True ): - return perform_redaction(litellm_logging_obj, result) + return perform_redaction(litellm_logging_obj.model_call_details, result) return result -def perform_redaction(litellm_logging_obj: LiteLLMLoggingObject, result): +def perform_redaction(model_call_details: dict, result): """ Performs the actual redaction on the logging object and result. """ # Redact model_call_details - litellm_logging_obj.model_call_details["messages"] = [ + model_call_details["messages"] = [ {"role": "user", "content": "redacted-by-litellm"} ] - litellm_logging_obj.model_call_details["prompt"] = "" - litellm_logging_obj.model_call_details["input"] = "" + model_call_details["prompt"] = "" + model_call_details["input"] = "" # Redact streaming response if ( - litellm_logging_obj.stream is True - and "complete_streaming_response" in litellm_logging_obj.model_call_details + model_call_details.get("stream", False) is True + and "complete_streaming_response" in model_call_details ): - _streaming_response = litellm_logging_obj.model_call_details[ - "complete_streaming_response" - ] + _streaming_response = model_call_details["complete_streaming_response"] for choice in _streaming_response.choices: if isinstance(choice, litellm.Choices): choice.message.content = "redacted-by-litellm" @@ -69,22 +67,19 @@ def perform_redaction(litellm_logging_obj: LiteLLMLoggingObject, result): elif isinstance(choice, litellm.utils.StreamingChoices): choice.delta.content = "redacted-by-litellm" return _result - - return result + else: + return "redacted-by-litellm" def redact_message_input_output_from_logging( - litellm_logging_obj: LiteLLMLoggingObject, result + model_call_details: dict, result, input: Optional[Any] = None ): """ Removes messages, prompts, input, response from logging. 
This modifies the data in-place only redacts when litellm.turn_off_message_logging == True """ _request_headers = ( - litellm_logging_obj.model_call_details.get("litellm_params", {}).get( - "metadata", {} - ) - or {} + model_call_details.get("litellm_params", {}).get("metadata", {}) or {} ) request_headers = _request_headers.get("headers", {}) @@ -101,7 +96,7 @@ def redact_message_input_output_from_logging( ): return result - return perform_redaction(litellm_logging_obj, result) + return perform_redaction(model_call_details, result) def redact_user_api_key_info(metadata: dict) -> dict: diff --git a/litellm/llms/anthropic/chat/handler.py b/litellm/llms/anthropic/chat/handler.py index d5ad4d858..bd2a76f68 100644 --- a/litellm/llms/anthropic/chat/handler.py +++ b/litellm/llms/anthropic/chat/handler.py @@ -43,7 +43,7 @@ from litellm.types.llms.openai import ( ChatCompletionToolCallFunctionChunk, ChatCompletionUsageBlock, ) -from litellm.types.utils import GenericStreamingChunk +from litellm.types.utils import GenericStreamingChunk, PromptTokensDetails from litellm.utils import CustomStreamWrapper, ModelResponse, Usage from ...base import BaseLLM @@ -283,19 +283,28 @@ class AnthropicChatCompletion(BaseLLM): completion_tokens = completion_response["usage"]["output_tokens"] _usage = completion_response["usage"] total_tokens = prompt_tokens + completion_tokens + cache_creation_input_tokens: int = 0 + cache_read_input_tokens: int = 0 model_response.created = int(time.time()) model_response.model = model + if "cache_creation_input_tokens" in _usage: + cache_creation_input_tokens = _usage["cache_creation_input_tokens"] + if "cache_read_input_tokens" in _usage: + cache_read_input_tokens = _usage["cache_read_input_tokens"] + + prompt_tokens_details = PromptTokensDetails( + cached_tokens=cache_read_input_tokens + ) usage = Usage( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, total_tokens=total_tokens, + prompt_tokens_details=prompt_tokens_details, + cache_creation_input_tokens=cache_creation_input_tokens, + cache_read_input_tokens=cache_read_input_tokens, ) - if "cache_creation_input_tokens" in _usage: - usage["cache_creation_input_tokens"] = _usage["cache_creation_input_tokens"] - if "cache_read_input_tokens" in _usage: - usage["cache_read_input_tokens"] = _usage["cache_read_input_tokens"] setattr(model_response, "usage", usage) # type: ignore model_response._hidden_params = _hidden_params diff --git a/litellm/llms/databricks/chat.py b/litellm/llms/databricks/chat.py index 0b9898b08..eb0cb341e 100644 --- a/litellm/llms/databricks/chat.py +++ b/litellm/llms/databricks/chat.py @@ -14,7 +14,11 @@ import requests # type: ignore import litellm from litellm.litellm_core_utils.core_helpers import map_finish_reason -from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + get_async_httpx_client, +) from litellm.llms.databricks.exceptions import DatabricksError from litellm.llms.databricks.streaming_utils import ModelResponseIterator from litellm.types.llms.openai import ( @@ -167,7 +171,7 @@ class DatabricksEmbeddingConfig: async def make_call( - client: AsyncHTTPHandler, + client: Optional[AsyncHTTPHandler], api_base: str, headers: dict, data: str, @@ -176,6 +180,10 @@ async def make_call( logging_obj, streaming_decoder: Optional[CustomStreamingDecoder] = None, ): + if client is None: + client = get_async_httpx_client( + llm_provider=litellm.LlmProviders.DATABRICKS + ) # 
Create a new client if none provided response = await client.post(api_base, headers=headers, data=data, stream=True) if response.status_code != 200: @@ -211,7 +219,7 @@ def make_sync_call( streaming_decoder: Optional[CustomStreamingDecoder] = None, ): if client is None: - client = HTTPHandler() # Create a new client if none provided + client = litellm.module_level_client # Create a new client if none provided response = client.post(api_base, headers=headers, data=data, stream=True) @@ -343,18 +351,18 @@ class DatabricksChatCompletion(BaseLLM): ) -> CustomStreamWrapper: data["stream"] = True + completion_stream = await make_call( + client=client, + api_base=api_base, + headers=headers, + data=json.dumps(data), + model=model, + messages=messages, + logging_obj=logging_obj, + streaming_decoder=streaming_decoder, + ) streamwrapper = CustomStreamWrapper( - completion_stream=None, - make_call=partial( - make_call, - api_base=api_base, - headers=headers, - data=json.dumps(data), - model=model, - messages=messages, - logging_obj=logging_obj, - streaming_decoder=streaming_decoder, - ), + completion_stream=completion_stream, model=model, custom_llm_provider=custom_llm_provider, logging_obj=logging_obj, @@ -530,28 +538,32 @@ class DatabricksChatCompletion(BaseLLM): base_model=base_model, ) else: - if client is None or not isinstance(client, HTTPHandler): - client = HTTPHandler(timeout=timeout) # type: ignore ## COMPLETION CALL if stream is True: - return CustomStreamWrapper( - completion_stream=None, - make_call=partial( - make_sync_call, - client=None, - api_base=api_base, - headers=headers, # type: ignore - data=json.dumps(data), - model=model, - messages=messages, - logging_obj=logging_obj, - streaming_decoder=streaming_decoder, + completion_stream = make_sync_call( + client=( + client + if client is not None and isinstance(client, HTTPHandler) + else None ), + api_base=api_base, + headers=headers, + data=json.dumps(data), + model=model, + messages=messages, + logging_obj=logging_obj, + streaming_decoder=streaming_decoder, + ) + # completion_stream.__iter__() + return CustomStreamWrapper( + completion_stream=completion_stream, model=model, custom_llm_provider=custom_llm_provider, logging_obj=logging_obj, ) else: + if client is None or not isinstance(client, HTTPHandler): + client = HTTPHandler(timeout=timeout) # type: ignore try: response = client.post( api_base, headers=headers, data=json.dumps(data) diff --git a/litellm/llms/databricks/streaming_utils.py b/litellm/llms/databricks/streaming_utils.py index 1b342f3c9..dd6b3c8aa 100644 --- a/litellm/llms/databricks/streaming_utils.py +++ b/litellm/llms/databricks/streaming_utils.py @@ -54,10 +54,10 @@ class ModelResponseIterator: is_finished = True finish_reason = processed_chunk.choices[0].finish_reason - if hasattr(processed_chunk, "usage") and isinstance( - processed_chunk.usage, litellm.Usage - ): - usage_chunk: litellm.Usage = processed_chunk.usage + usage_chunk: Optional[litellm.Usage] = getattr( + processed_chunk, "usage", None + ) + if usage_chunk is not None: usage = ChatCompletionUsageBlock( prompt_tokens=usage_chunk.prompt_tokens, @@ -82,6 +82,8 @@ class ModelResponseIterator: return self def __next__(self): + if not hasattr(self, "response_iterator"): + self.response_iterator = self.streaming_response try: chunk = self.response_iterator.__next__() except StopIteration: diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py index 0ea527ccf..2a9453031 100644 --- 
a/litellm/llms/prompt_templates/factory.py +++ b/litellm/llms/prompt_templates/factory.py @@ -62,7 +62,11 @@ DEFAULT_USER_CONTINUE_MESSAGE = { # used to interweave assistant messages, to ensure user/assistant alternating DEFAULT_ASSISTANT_CONTINUE_MESSAGE = { "role": "assistant", - "content": "Please continue.", + "content": [ + { + "text": "Please continue.", + } + ], } # similar to autogen. Only used if `litellm.modify_params=True`. @@ -2354,17 +2358,40 @@ def _convert_to_bedrock_tool_call_result( return content_block +def _insert_assistant_continue_message( + messages: List[BedrockMessageBlock], + assistant_continue_message: Optional[str] = None, +) -> List[BedrockMessageBlock]: + """ + Add dummy message between user/tool result blocks. + + Conversation blocks and tool result blocks cannot be provided in the same turn. Issue: https://github.com/BerriAI/litellm/issues/6053 + """ + if assistant_continue_message is not None: + messages.append( + BedrockMessageBlock( + role="assistant", + content=[BedrockContentBlock(text=assistant_continue_message)], + ) + ) + elif litellm.modify_params: + messages.append(BedrockMessageBlock(**DEFAULT_ASSISTANT_CONTINUE_MESSAGE)) # type: ignore + return messages + + def _bedrock_converse_messages_pt( messages: List, model: str, llm_provider: str, user_continue_message: Optional[dict] = None, + assistant_continue_message: Optional[str] = None, ) -> List[BedrockMessageBlock]: """ Converts given messages from OpenAI format to Bedrock format - Roles must alternate b/w 'user' and 'model' (same as anthropic -> merge consecutive roles) - Please ensure that function response turn comes immediately after a function call turn + - Conversation blocks and tool result blocks cannot be provided in the same turn. Issue: https://github.com/BerriAI/litellm/issues/6053 """ contents: List[BedrockMessageBlock] = [] @@ -2387,37 +2414,78 @@ def _bedrock_converse_messages_pt( while msg_i < len(messages): user_content: List[BedrockContentBlock] = [] init_msg_i = msg_i - valid_user_roles = ["user", "tool", "function"] - ## MERGE CONSECUTIVE USER + TOOL CONTENT ## - while msg_i < len(messages) and messages[msg_i]["role"] in valid_user_roles: - if messages[msg_i]["role"] == "user": - if isinstance(messages[msg_i]["content"], list): - _parts: List[BedrockContentBlock] = [] - for element in messages[msg_i]["content"]: - if isinstance(element, dict): - if element["type"] == "text": - _part = BedrockContentBlock(text=element["text"]) - _parts.append(_part) - elif element["type"] == "image_url": - image_url = element["image_url"]["url"] - _part = _process_bedrock_converse_image_block( # type: ignore - image_url=image_url - ) - _parts.append(BedrockContentBlock(image=_part)) # type: ignore - user_content.extend(_parts) - elif isinstance(messages[msg_i]["content"], str): - _part = BedrockContentBlock(text=messages[msg_i]["content"]) - user_content.append(_part) - elif ( - messages[msg_i]["role"] == "tool" - or messages[msg_i]["role"] == "function" - ): - tool_call_result = _convert_to_bedrock_tool_call_result(messages[msg_i]) - user_content.append(tool_call_result) - msg_i += 1 + ## MERGE CONSECUTIVE USER CONTENT ## + while msg_i < len(messages) and messages[msg_i]["role"] == "user": + if isinstance(messages[msg_i]["content"], list): + _parts: List[BedrockContentBlock] = [] + for element in messages[msg_i]["content"]: + if isinstance(element, dict): + if element["type"] == "text": + _part = BedrockContentBlock(text=element["text"]) + _parts.append(_part) + elif element["type"] == 
"image_url": + image_url = element["image_url"]["url"] + _part = _process_bedrock_converse_image_block( # type: ignore + image_url=image_url + ) + _parts.append(BedrockContentBlock(image=_part)) # type: ignore + user_content.extend(_parts) + else: + _part = BedrockContentBlock(text=messages[msg_i]["content"]) + user_content.append(_part) + msg_i += 1 if user_content: - contents.append(BedrockMessageBlock(role="user", content=user_content)) + if len(contents) > 0 and contents[-1]["role"] == "user": + if ( + assistant_continue_message is not None + or litellm.modify_params is True + ): + # if last message was a 'user' message, then add a dummy assistant message (bedrock requires alternating roles) + contents = _insert_assistant_continue_message( + messages=contents, + assistant_continue_message=assistant_continue_message, + ) + contents.append( + BedrockMessageBlock(role="user", content=user_content) + ) + else: + verbose_logger.warning( + "Potential consecutive user/tool blocks. Trying to merge. If error occurs, please set a 'assistant_continue_message' or set 'modify_params=True' to insert a dummy assistant message for bedrock calls." + ) + contents[-1]["content"].extend(user_content) + else: + contents.append(BedrockMessageBlock(role="user", content=user_content)) + + ## MERGE CONSECUTIVE TOOL CALL MESSAGES ## + tool_content: List[BedrockContentBlock] = [] + while msg_i < len(messages) and messages[msg_i]["role"] == "tool": + tool_call_result = _convert_to_bedrock_tool_call_result(messages[msg_i]) + + tool_content.append(tool_call_result) + msg_i += 1 + if tool_content: + # if last message was a 'user' message, then add a blank assistant message (bedrock requires alternating roles) + if len(contents) > 0 and contents[-1]["role"] == "user": + if ( + assistant_continue_message is not None + or litellm.modify_params is True + ): + # if last message was a 'user' message, then add a dummy assistant message (bedrock requires alternating roles) + contents = _insert_assistant_continue_message( + messages=contents, + assistant_continue_message=assistant_continue_message, + ) + contents.append( + BedrockMessageBlock(role="user", content=tool_content) + ) + else: + verbose_logger.warning( + "Potential consecutive user/tool blocks. Trying to merge. If error occurs, please set a 'assistant_continue_message' or set 'modify_params=True' to insert a dummy assistant message for bedrock calls." 
+ ) + contents[-1]["content"].extend(tool_content) + else: + contents.append(BedrockMessageBlock(role="user", content=tool_content)) assistant_content: List[BedrockContentBlock] = [] ## MERGE CONSECUTIVE ASSISTANT CONTENT ## while msg_i < len(messages) and messages[msg_i]["role"] == "assistant": @@ -2555,10 +2623,9 @@ def _bedrock_tools_pt(tools: List) -> List[BedrockToolBlock]: """ tool_block_list: List[BedrockToolBlock] = [] for tool in tools: - parameters = tool.get("function", {}).get("parameters", { - "type": "object", - "properties": {} - }) + parameters = tool.get("function", {}).get( + "parameters", {"type": "object", "properties": {}} + ) name = tool.get("function", {}).get("name", "") # related issue: https://github.com/BerriAI/litellm/issues/5007 diff --git a/litellm/llms/sagemaker/sagemaker.py b/litellm/llms/sagemaker/sagemaker.py index ec8845b9e..27023ab3f 100644 --- a/litellm/llms/sagemaker/sagemaker.py +++ b/litellm/llms/sagemaker/sagemaker.py @@ -633,15 +633,14 @@ class SagemakerLLM(BaseAWSLLM): "aws_region_name": aws_region_name, } prepared_request = await asyncified_prepare_request(**prepared_request_args) + completion_stream = await self.make_async_call( + api_base=prepared_request.url, + headers=prepared_request.headers, # type: ignore + data=data, + logging_obj=logging_obj, + ) streaming_response = CustomStreamWrapper( - completion_stream=None, - make_call=partial( - self.make_async_call, - api_base=prepared_request.url, - headers=prepared_request.headers, # type: ignore - data=data, - logging_obj=logging_obj, - ), + completion_stream=completion_stream, model=model, custom_llm_provider="sagemaker", logging_obj=logging_obj, diff --git a/litellm/proxy/_new_secret_config.yaml b/litellm/proxy/_new_secret_config.yaml index f5df0a282..f08f15435 100644 --- a/litellm/proxy/_new_secret_config.yaml +++ b/litellm/proxy/_new_secret_config.yaml @@ -1,12 +1,73 @@ model_list: - - model_name: gpt-4o + - model_name: fake-claude-endpoint litellm_params: - model: azure/gpt-4o-realtime-preview - api_key: os.environ/AZURE_SWEDEN_API_KEY - api_base: os.environ/AZURE_SWEDEN_API_BASE + model: anthropic.claude-3-sonnet-20240229-v1:0 + api_base: https://exampleopenaiendpoint-production.up.railway.app + aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY + aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID + - model_name: gemini-vision + litellm_params: + model: vertex_ai/gemini-1.0-pro-vision-001 + api_base: https://exampleopenaiendpoint-production.up.railway.app/v1/projects/adroit-crow-413218/locations/us-central1/publishers/google/models/gemini-1.0-pro-vision-001 + vertex_project: "adroit-crow-413218" + vertex_location: "us-central1" + - model_name: fake-azure-endpoint + litellm_params: + model: openai/429 + api_key: fake-key + api_base: https://exampleopenaiendpoint-production.up.railway.app + - model_name: fake-openai-endpoint + litellm_params: + model: gpt-3.5-turbo + api_base: https://exampleopenaiendpoint-production.up.railway.app + - model_name: o1-preview + litellm_params: + model: o1-preview + - model_name: rerank-english-v3.0 + litellm_params: + model: cohere/rerank-english-v3.0 + api_key: os.environ/COHERE_API_KEY + - model_name: azure-rerank-english-v3.0 + litellm_params: + model: azure_ai/rerank-english-v3.0 + api_base: os.environ/AZURE_AI_COHERE_API_BASE + api_key: os.environ/AZURE_AI_COHERE_API_KEY + - model_name: "databricks/*" + litellm_params: + model: "databricks/*" + api_key: os.environ/DATABRICKS_API_KEY + api_base: os.environ/DATABRICKS_API_BASE + - model_name: 
"anthropic/*" + litellm_params: + model: "anthropic/*" + - model_name: "*" + litellm_params: + model: "openai/*" + - model_name: "fireworks_ai/*" + litellm_params: + model: "fireworks_ai/*" + configurable_clientside_auth_params: ["api_base"] + - model_name: "gemini-flash-experimental" + litellm_params: + model: "vertex_ai/gemini-flash-experimental" - - model_name: openai-gpt-4o-realtime-audio - litellm_params: - model: openai/gpt-4o-realtime-preview-2024-10-01 - api_key: os.environ/OPENAI_API_KEY - api_base: http://localhost:8080 \ No newline at end of file +litellm_settings: + turn_off_message_logging: true + # callbacks: + # - prometheus + # - otel + failure_callback: + - sentry + - prometheus + success_callback: + - prometheus + - s3 + s3_callback_params: + s3_bucket_name: mytestbucketlitellm # AWS Bucket Name for S3 + s3_region_name: us-west-2 # AWS Region Name for S3 + s3_aws_access_key_id: os.environ/AWS_ACCESS_KEY_ID # us os.environ/ to pass environment variables. This is AWS Access Key ID for S3 + s3_aws_secret_access_key: os.environ/AWS_SECRET_ACCESS_KEY # AWS Secret Access Key for S3 + +general_settings: + db_url: os.environ/DATABASE_URL + # disable_prisma_schema_update: true \ No newline at end of file diff --git a/litellm/types/utils.py b/litellm/types/utils.py index 429d8cf6e..35d48d9ff 100644 --- a/litellm/types/utils.py +++ b/litellm/types/utils.py @@ -319,7 +319,7 @@ ChatCompletionMessage(content='This is a test', role='assistant', function_call= class Message(OpenAIObject): content: Optional[str] - role: Literal["assistant"] + role: Literal["assistant", "user", "system", "tool", "function"] tool_calls: Optional[List[ChatCompletionMessageToolCall]] function_call: Optional[FunctionCall] @@ -488,14 +488,6 @@ class Usage(CompletionUsage): ] = None, **params, ): - ## DEEPSEEK PROMPT TOKEN HANDLING ## - follow the anthropic format, of having prompt tokens be just the non-cached token input. 
Enables accurate cost-tracking - Relevant issue: https://github.com/BerriAI/litellm/issues/5285 - if ( - "prompt_cache_miss_tokens" in params - and isinstance(params["prompt_cache_miss_tokens"], int) - and prompt_tokens is not None - ): - prompt_tokens = params["prompt_cache_miss_tokens"] - # handle reasoning_tokens _completion_tokens_details: Optional[CompletionTokensDetails] = None if reasoning_tokens: @@ -512,6 +504,24 @@ class Usage(CompletionUsage): elif isinstance(completion_tokens_details, CompletionTokensDetails): _completion_tokens_details = completion_tokens_details + ## DEEPSEEK MAPPING ## + if "prompt_cache_hit_tokens" in params and isinstance( + params["prompt_cache_hit_tokens"], int + ): + if prompt_tokens_details is None: + prompt_tokens_details = PromptTokensDetails( + cached_tokens=params["prompt_cache_hit_tokens"] + ) + + ## ANTHROPIC MAPPING ## + if "cache_read_input_tokens" in params and isinstance( + params["cache_read_input_tokens"], int + ): + if prompt_tokens_details is None: + prompt_tokens_details = PromptTokensDetails( + cached_tokens=params["cache_read_input_tokens"] + ) + # handle prompt_tokens_details _prompt_tokens_details: Optional[PromptTokensDetails] = None if prompt_tokens_details: diff --git a/tests/local_testing/test_bedrock_completion.py b/tests/local_testing/test_bedrock_completion.py index c39239259..1e65424d2 100644 --- a/tests/local_testing/test_bedrock_completion.py +++ b/tests/local_testing/test_bedrock_completion.py @@ -1633,3 +1633,288 @@ def test_bedrock_completion_test_3(): } } ] + + +@pytest.mark.parametrize("modify_params", [True, False]) +def test_bedrock_completion_test_4(modify_params): + litellm.set_verbose = True + litellm.modify_params = modify_params + + data = { + "model": "anthropic.claude-3-opus-20240229-v1:0", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "\nWhat is this file?\n"}, + { + "type": "text", + "text": "\n# VSCode Visible Files\ncomputer-vision/hm-open3d/src/main.py\n\n# VSCode Open Tabs\ncomputer-vision/hm-open3d/src/main.py\n\n# Current Working Directory (/Users/hongbo-miao/Clouds/Git/hongbomiao.com) 
Files\n.ansible-lint\n.clang-format\n.cmakelintrc\n.dockerignore\n.editorconfig\n.gitignore\n.gitmodules\n.hadolint.yaml\n.isort.cfg\n.markdownlint-cli2.jsonc\n.mergify.yml\n.npmrc\n.nvmrc\n.prettierignore\n.rubocop.yml\n.ruby-version\n.ruff.toml\n.shellcheckrc\n.solhint.json\n.solhintignore\n.sqlfluff\n.sqlfluffignore\n.stylelintignore\n.yamllint.yaml\nCODE_OF_CONDUCT.md\ncommitlint.config.js\nGemfile\nGemfile.lock\nLICENSE\nlint-staged.config.js\nMakefile\nmiss_hit.cfg\nmypy.ini\npackage-lock.json\npackage.json\npoetry.lock\npoetry.toml\nprettier.config.js\npyproject.toml\nREADME.md\nrelease.config.js\nrenovate.json\nSECURITY.md\nstylelint.config.js\naerospace/\naerospace/air-defense-system/\naerospace/hm-aerosandbox/\naerospace/hm-openaerostruct/\naerospace/px4/\naerospace/quadcopter-pd-controller/\naerospace/simulate-satellite/\naerospace/simulated-and-actual-flights/\naerospace/toroidal-propeller/\nansible/\nansible/inventory.yaml\nansible/Makefile\nansible/requirements.yml\nansible/hm_macos_group/\nansible/hm_ubuntu_group/\nansible/hm_windows_group/\napi-go/\napi-go/buf.yaml\napi-go/go.mod\napi-go/go.sum\napi-go/Makefile\napi-go/api/\napi-go/build/\napi-go/cmd/\napi-go/config/\napi-go/internal/\napi-node/\napi-node/.env.development\napi-node/.env.development.local.example\napi-node/.env.development.local.example.docker\napi-node/.env.production\napi-node/.env.production.local.example\napi-node/.env.test\napi-node/.eslintignore\napi-node/.eslintrc.js\napi-node/.npmrc\napi-node/.nvmrc\napi-node/babel.config.js\napi-node/docker-compose.cypress.yaml\napi-node/docker-compose.development.yaml\napi-node/Dockerfile\napi-node/Dockerfile.development\napi-node/jest.config.js\napi-node/Makefile\napi-node/package-lock.json\napi-node/package.json\napi-node/Procfile\napi-node/stryker.conf.js\napi-node/tsconfig.json\napi-node/bin/\napi-node/postgres/\napi-node/scripts/\napi-node/src/\napi-python/\napi-python/.flaskenv\napi-python/docker-entrypoint.sh\napi-python/Dockerfile\napi-python/Makefile\napi-python/poetry.lock\napi-python/poetry.toml\napi-python/pyproject.toml\napi-python/flaskr/\nasterios/\nasterios/led-blinker/\nauthorization/\nauthorization/hm-opal-client/\nauthorization/ory-hydra/\nautomobile/\nautomobile/build-map-by-lidar-point-cloud/\nautomobile/detect-lane-by-lidar-point-cloud/\nbin/\nbin/clean.sh\nbin/count_code_lines.sh\nbin/lint_javascript_fix.sh\nbin/lint_javascript.sh\nbin/set_up.sh\nbiology/\nbiology/compare-nucleotide-sequences/\nbusybox/\nbusybox/Makefile\ncaddy/\ncaddy/Caddyfile\ncaddy/Makefile\ncaddy/bin/\ncloud-computing/\ncloud-computing/hm-ray/\ncloud-computing/hm-skypilot/\ncloud-cost/\ncloud-cost/komiser/\ncloud-infrastructure/\ncloud-infrastructure/hm-pulumi/\ncloud-infrastructure/karpenter/\ncloud-infrastructure/terraform/\ncloud-platform/\ncloud-platform/aws/\ncloud-platform/google-cloud/\ncloud-security/\ncloud-security/hm-prowler/\ncomputational-fluid-dynamics/\ncomputational-fluid-dynamics/matlab/\ncomputational-fluid-dynamics/openfoam/\ncomputer-vision/\ncomputer-vision/hm-open3d/\ncomputer-vision/hm-pyvista/\ndata-analytics/\ndata-analytics/hm-geopandas/\ndata-distribution-service/\ndata-distribution-service/dummy_test.py\ndata-distribution-service/hm_message.idl\ndata-distribution-service/hm_message.xml\ndata-distribution-service/Makefile\ndata-distribution-service/poetry.lock\ndata-distribution-service/poetry.toml\ndata-distribution-service/publish.py\ndata-ingestion/\ndata-orchestration/\ndata-processing/\ndata-storage/\ndata-transformation/\ndata-visualizatio
n/\ndesktop-qt/\nembedded/\nethereum/\ngit/\ngolang-migrate/\nhardware-in-the-loop/\nhasura-graphql-engine/\nhigh-performance-computing/\nhm-alpine/\nhm-kafka/\nhm-locust/\nhm-rust/\nhm-traefik/\nhm-xxhash/\nkubernetes/\nmachine-learning/\nmatlab/\nmobile/\nnetwork-programmability/\noperating-system/\nparallel-computing/\nphysics/\nquantum-computing/\nrclone/\nrestic/\nreverse-engineering/\nrobotics/\nsubmodules/\ntrino/\nvagrant/\nvalgrind/\nvhdl/\nvim/\nweb/\nweb-cypress/\nwireless-network/\n\n(File list truncated. Use list_files on specific subdirectories if you need to explore further.)\n", + }, + ], + }, + { + "role": "assistant", + "content": '\nThe user is asking about a specific file: main.py. Based on the environment details provided, this file is located in the computer-vision/hm-open3d/src/ directory and is currently open in a VSCode tab.\n\nTo answer the question of what this file is, the most relevant tool would be the read_file tool. This will allow me to examine the contents of main.py to determine its purpose.\n\nThe read_file tool requires the "path" parameter. I can infer this path based on the environment details:\npath: "computer-vision/hm-open3d/src/main.py"\n\nSince I have the necessary parameter, I can proceed with calling the read_file tool.\n', + "tool_calls": [ + { + "id": "tooluse_qCt-KEyWQlWiyHl26spQVA", + "type": "function", + "function": { + "name": "read_file", + "arguments": '{"path":"computer-vision/hm-open3d/src/main.py"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "tooluse_qCt-KEyWQlWiyHl26spQVA", + "content": 'import numpy as np\nimport open3d as o3d\n\n\ndef main():\n ply_point_cloud = o3d.data.PLYPointCloud()\n pcd = o3d.io.read_point_cloud(ply_point_cloud.path)\n print(pcd)\n print(np.asarray(pcd.points))\n\n demo_crop_data = o3d.data.DemoCropPointCloud()\n vol = o3d.visualization.read_selection_polygon_volume(\n demo_crop_data.cropped_json_path\n )\n chair = vol.crop_point_cloud(pcd)\n\n dists = pcd.compute_point_cloud_distance(chair)\n dists = np.asarray(dists)\n idx = np.where(dists > 0.01)[0]\n pcd_without_chair = pcd.select_by_index(idx)\n\n axis_aligned_bounding_box = chair.get_axis_aligned_bounding_box()\n axis_aligned_bounding_box.color = (1, 0, 0)\n\n oriented_bounding_box = chair.get_oriented_bounding_box()\n oriented_bounding_box.color = (0, 1, 0)\n\n o3d.visualization.draw_geometries(\n [pcd_without_chair, chair, axis_aligned_bounding_box, oriented_bounding_box],\n zoom=0.3412,\n front=[0.4, -0.2, -0.9],\n lookat=[2.6, 2.0, 1.5],\n up=[-0.10, -1.0, 0.2],\n )\n\n\nif __name__ == "__main__":\n main()\n', + }, + { + "role": "user", + "content": [ + { + "type": "text", + "text": "\n# VSCode Visible Files\ncomputer-vision/hm-open3d/src/main.py\n\n# VSCode Open Tabs\ncomputer-vision/hm-open3d/src/main.py\n", + } + ], + }, + ], + "temperature": 0.2, + "tools": [ + { + "type": "function", + "function": { + "name": "execute_command", + "description": "Execute a CLI command on the system. Use this when you need to perform system operations or run specific commands to accomplish any step in the user's task. You must tailor your command to the user's system and provide a clear explanation of what the command does. Prefer to execute complex CLI commands over creating executable scripts, as they are more flexible and easier to run. 
Commands will be executed in the current working directory: /Users/hongbo-miao/Clouds/Git/hongbomiao.com", + "parameters": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "The CLI command to execute. This should be valid for the current operating system. Ensure the command is properly formatted and does not contain any harmful instructions.", + } + }, + "required": ["command"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "read_file", + "description": "Read the contents of a file at the specified path. Use this when you need to examine the contents of an existing file, for example to analyze code, review text files, or extract information from configuration files. Automatically extracts raw text from PDF and DOCX files. May not be suitable for other types of binary files, as it returns the raw content as a string.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The path of the file to read (relative to the current working directory /Users/hongbo-miao/Clouds/Git/hongbomiao.com)", + } + }, + "required": ["path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "write_to_file", + "description": "Write content to a file at the specified path. If the file exists, it will be overwritten with the provided content. If the file doesn't exist, it will be created. Always provide the full intended content of the file, without any truncation. This tool will automatically create any directories needed to write the file.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The path of the file to write to (relative to the current working directory /Users/hongbo-miao/Clouds/Git/hongbomiao.com)", + }, + "content": { + "type": "string", + "description": "The full content to write to the file.", + }, + }, + "required": ["path", "content"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "search_files", + "description": "Perform a regex search across files in a specified directory, providing context-rich results. This tool searches for patterns or specific content across multiple files, displaying each match with encapsulating context.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The path of the directory to search in (relative to the current working directory /Users/hongbo-miao/Clouds/Git/hongbomiao.com). This directory will be recursively searched.", + }, + "regex": { + "type": "string", + "description": "The regular expression pattern to search for. Uses Rust regex syntax.", + }, + "filePattern": { + "type": "string", + "description": "Optional glob pattern to filter files (e.g., '*.ts' for TypeScript files). If not provided, it will search all files (*).", + }, + }, + "required": ["path", "regex"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "list_files", + "description": "List files and directories within the specified directory. If recursive is true, it will list all files and directories recursively. 
If recursive is false or not provided, it will only list the top-level contents.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The path of the directory to list contents for (relative to the current working directory /Users/hongbo-miao/Clouds/Git/hongbomiao.com)", + }, + "recursive": { + "type": "string", + "enum": ["true", "false"], + "description": "Whether to list files recursively. Use 'true' for recursive listing, 'false' or omit for top-level only.", + }, + }, + "required": ["path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "list_code_definition_names", + "description": "Lists definition names (classes, functions, methods, etc.) used in source code files at the top level of the specified directory. This tool provides insights into the codebase structure and important constructs, encapsulating high-level concepts and relationships that are crucial for understanding the overall architecture.", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "The path of the directory (relative to the current working directory /Users/hongbo-miao/Clouds/Git/hongbomiao.com) to list top level source code definitions for", + } + }, + "required": ["path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "inspect_site", + "description": "Captures a screenshot and console logs of the initial state of a website. This tool navigates to the specified URL, takes a screenshot of the entire page as it appears immediately after loading, and collects any console logs or errors that occur during page load. It does not interact with the page or capture any state changes after the initial load.", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "The URL of the site to inspect. This should be a valid URL including the protocol (e.g. http://localhost:3000/page, file:///path/to/file.html, etc.)", + } + }, + "required": ["url"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "ask_followup_question", + "description": "Ask the user a question to gather additional information needed to complete the task. This tool should be used when you encounter ambiguities, need clarification, or require more details to proceed effectively. It allows for interactive problem-solving by enabling direct communication with the user. Use this tool judiciously to maintain a balance between gathering necessary information and avoiding excessive back-and-forth.", + "parameters": { + "type": "object", + "properties": { + "question": { + "type": "string", + "description": "The question to ask the user. This should be a clear, specific question that addresses the information you need.", + } + }, + "required": ["question"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "attempt_completion", + "description": "Once you've completed the task, use this tool to present the result to the user. Optionally you may provide a CLI command to showcase the result of your work, but avoid using commands like 'echo' or 'cat' that merely print text. They may respond with feedback if they are not satisfied with the result, which you can use to make improvements and try again.", + "parameters": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "A CLI command to execute to show a live demo of the result to the user. 
+                                "description": "A CLI command to execute to show a live demo of the result to the user. For example, use 'open index.html' to display a created website. This command should be valid for the current operating system. Ensure the command is properly formatted and does not contain any harmful instructions.",
+                            },
+                            "result": {
+                                "type": "string",
+                                "description": "The result of the task. Formulate this result in a way that is final and does not require further input from the user. Don't end your result with questions or offers for further assistance.",
+                            },
+                        },
+                        "required": ["result"],
+                    },
+                },
+            },
+        ],
+        "tool_choice": "auto",
+    }
+
+    if modify_params:
+        transformed_messages = _bedrock_converse_messages_pt(
+            messages=data["messages"], model="", llm_provider=""
+        )
+        expected_messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"text": "\nWhat is this file?\n"},
+                    {
+                        "text": "\n# VSCode Visible Files\ncomputer-vision/hm-open3d/src/main.py\n\n# VSCode Open Tabs\ncomputer-vision/hm-open3d/src/main.py\n\n# Current Working Directory (/Users/hongbo-miao/Clouds/Git/hongbomiao.com) Files\n.ansible-lint\n.clang-format\n.cmakelintrc\n.dockerignore\n.editorconfig\n.gitignore\n.gitmodules\n.hadolint.yaml\n.isort.cfg\n.markdownlint-cli2.jsonc\n.mergify.yml\n.npmrc\n.nvmrc\n.prettierignore\n.rubocop.yml\n.ruby-version\n.ruff.toml\n.shellcheckrc\n.solhint.json\n.solhintignore\n.sqlfluff\n.sqlfluffignore\n.stylelintignore\n.yamllint.yaml\nCODE_OF_CONDUCT.md\ncommitlint.config.js\nGemfile\nGemfile.lock\nLICENSE\nlint-staged.config.js\nMakefile\nmiss_hit.cfg\nmypy.ini\npackage-lock.json\npackage.json\npoetry.lock\npoetry.toml\nprettier.config.js\npyproject.toml\nREADME.md\nrelease.config.js\nrenovate.json\nSECURITY.md\nstylelint.config.js\naerospace/\naerospace/air-defense-system/\naerospace/hm-aerosandbox/\naerospace/hm-openaerostruct/\naerospace/px4/\naerospace/quadcopter-pd-controller/\naerospace/simulate-satellite/\naerospace/simulated-and-actual-flights/\naerospace/toroidal-propeller/\nansible/\nansible/inventory.yaml\nansible/Makefile\nansible/requirements.yml\nansible/hm_macos_group/\nansible/hm_ubuntu_group/\nansible/hm_windows_group/\napi-go/\napi-go/buf.yaml\napi-go/go.mod\napi-go/go.sum\napi-go/Makefile\napi-go/api/\napi-go/build/\napi-go/cmd/\napi-go/config/\napi-go/internal/\napi-node/\napi-node/.env.development\napi-node/.env.development.local.example\napi-node/.env.development.local.example.docker\napi-node/.env.production\napi-node/.env.production.local.example\napi-node/.env.test\napi-node/.eslintignore\napi-node/.eslintrc.js\napi-node/.npmrc\napi-node/.nvmrc\napi-node/babel.config.js\napi-node/docker-compose.cypress.yaml\napi-node/docker-compose.development.yaml\napi-node/Dockerfile\napi-node/Dockerfile.development\napi-node/jest.config.js\napi-node/Makefile\napi-node/package-lock.json\napi-node/package.json\napi-node/Procfile\napi-node/stryker.conf.js\napi-node/tsconfig.json\napi-node/bin/\napi-node/postgres/\napi-node/scripts/\napi-node/src/\napi-python/\napi-python/.flaskenv\napi-python/docker-entrypoint.sh\napi-python/Dockerfile\napi-python/Makefile\napi-python/poetry.lock\napi-python/poetry.toml\napi-python/pyproject.toml\napi-python/flaskr/\nasterios/\nasterios/led-blinker/\nauthorization/\nauthorization/hm-opal-client/\nauthorization/ory-hydra/\nautomobile/\nautomobile/build-map-by-lidar-point-cloud/\nautomobile/detect-lane-by-lidar-point-cloud/\nbin/\nbin/clean.sh\nbin/count_code_lines.sh\nbin/lint_javascript_fix.sh\nbin/lint_javascript.sh\nbin/set_up.sh\nbiology/\nbiology/compare-nucleotide-sequences/\nbusybox/\nbusybox/Makefile\ncaddy/\ncaddy/Caddyfile\ncaddy/Makefile\ncaddy/bin/\ncloud-computing/\ncloud-computing/hm-ray/\ncloud-computing/hm-skypilot/\ncloud-cost/\ncloud-cost/komiser/\ncloud-infrastructure/\ncloud-infrastructure/hm-pulumi/\ncloud-infrastructure/karpenter/\ncloud-infrastructure/terraform/\ncloud-platform/\ncloud-platform/aws/\ncloud-platform/google-cloud/\ncloud-security/\ncloud-security/hm-prowler/\ncomputational-fluid-dynamics/\ncomputational-fluid-dynamics/matlab/\ncomputational-fluid-dynamics/openfoam/\ncomputer-vision/\ncomputer-vision/hm-open3d/\ncomputer-vision/hm-pyvista/\ndata-analytics/\ndata-analytics/hm-geopandas/\ndata-distribution-service/\ndata-distribution-service/dummy_test.py\ndata-distribution-service/hm_message.idl\ndata-distribution-service/hm_message.xml\ndata-distribution-service/Makefile\ndata-distribution-service/poetry.lock\ndata-distribution-service/poetry.toml\ndata-distribution-service/publish.py\ndata-ingestion/\ndata-orchestration/\ndata-processing/\ndata-storage/\ndata-transformation/\ndata-visualization/\ndesktop-qt/\nembedded/\nethereum/\ngit/\ngolang-migrate/\nhardware-in-the-loop/\nhasura-graphql-engine/\nhigh-performance-computing/\nhm-alpine/\nhm-kafka/\nhm-locust/\nhm-rust/\nhm-traefik/\nhm-xxhash/\nkubernetes/\nmachine-learning/\nmatlab/\nmobile/\nnetwork-programmability/\noperating-system/\nparallel-computing/\nphysics/\nquantum-computing/\nrclone/\nrestic/\nreverse-engineering/\nrobotics/\nsubmodules/\ntrino/\nvagrant/\nvalgrind/\nvhdl/\nvim/\nweb/\nweb-cypress/\nwireless-network/\n\n(File list truncated. Use list_files on specific subdirectories if you need to explore further.)\n"
+                    },
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {
+                        "toolUse": {
+                            "input": {"path": "computer-vision/hm-open3d/src/main.py"},
+                            "name": "read_file",
+                            "toolUseId": "tooluse_qCt-KEyWQlWiyHl26spQVA",
+                        }
+                    }
+                ],
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "toolResult": {
+                            "content": [
+                                {
+                                    "text": 'import numpy as np\nimport open3d as o3d\n\n\ndef main():\n    ply_point_cloud = o3d.data.PLYPointCloud()\n    pcd = o3d.io.read_point_cloud(ply_point_cloud.path)\n    print(pcd)\n    print(np.asarray(pcd.points))\n\n    demo_crop_data = o3d.data.DemoCropPointCloud()\n    vol = o3d.visualization.read_selection_polygon_volume(\n        demo_crop_data.cropped_json_path\n    )\n    chair = vol.crop_point_cloud(pcd)\n\n    dists = pcd.compute_point_cloud_distance(chair)\n    dists = np.asarray(dists)\n    idx = np.where(dists > 0.01)[0]\n    pcd_without_chair = pcd.select_by_index(idx)\n\n    axis_aligned_bounding_box = chair.get_axis_aligned_bounding_box()\n    axis_aligned_bounding_box.color = (1, 0, 0)\n\n    oriented_bounding_box = chair.get_oriented_bounding_box()\n    oriented_bounding_box.color = (0, 1, 0)\n\n    o3d.visualization.draw_geometries(\n        [pcd_without_chair, chair, axis_aligned_bounding_box, oriented_bounding_box],\n        zoom=0.3412,\n        front=[0.4, -0.2, -0.9],\n        lookat=[2.6, 2.0, 1.5],\n        up=[-0.10, -1.0, 0.2],\n    )\n\n\nif __name__ == "__main__":\n    main()\n'
+                                }
+                            ],
+                            "toolUseId": "tooluse_qCt-KEyWQlWiyHl26spQVA",
+                        }
+                    }
+                ],
+            },
+            {"role": "assistant", "content": [{"text": "Please continue."}]},
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "text": "\n# VSCode Visible Files\ncomputer-vision/hm-open3d/src/main.py\n\n# VSCode Open Tabs\ncomputer-vision/hm-open3d/src/main.py\n"
+                    }
+                ],
+            },
+        ]
+        assert transformed_messages == expected_messages
+    else:
+        with pytest.raises(Exception) as e:
+            litellm.completion(**data)
+        assert "litellm.modify_params" in str(e.value)
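For reference, the test above drives litellm's Bedrock Converse message transformation directly. A minimal sketch of the same flow follows; it assumes _bedrock_converse_messages_pt is importable from litellm.llms.prompt_templates.factory (the module this patch touches) and keeps the signature used in the test.

# Sketch only (not part of the patch): mirrors the helper call made in the test above.
import litellm
from litellm.llms.prompt_templates.factory import _bedrock_converse_messages_pt

# With modify_params enabled, litellm may insert a placeholder assistant turn
# (e.g. {"text": "Please continue."}) so the conversation alternates roles as
# Bedrock Converse expects; with it disabled, the test expects an error whose
# message mentions "litellm.modify_params".
litellm.modify_params = True

messages = [
    {"role": "user", "content": "What is this file?"},
    {"role": "user", "content": "# VSCode Visible Files\ncomputer-vision/hm-open3d/src/main.py"},
]
transformed = _bedrock_converse_messages_pt(
    messages=messages, model="", llm_provider=""
)
# Consecutive user turns are expected to be merged into one Converse message,
# e.g. [{"role": "user", "content": [{"text": ...}, {"text": ...}]}]
print(transformed)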
diff --git a/tests/local_testing/test_completion_cost.py b/tests/local_testing/test_completion_cost.py
index 1ab1f41bb..8408a5051 100644
--- a/tests/local_testing/test_completion_cost.py
+++ b/tests/local_testing/test_completion_cost.py
@@ -1077,7 +1077,9 @@ def test_completion_cost_deepseek():
         assert response_2.usage.prompt_cache_hit_tokens is not None
         assert response_2.usage.prompt_cache_miss_tokens is not None
         assert (
-            response_2.usage.prompt_tokens == response_2.usage.prompt_cache_miss_tokens
+            response_2.usage.prompt_tokens
+            == response_2.usage.prompt_cache_miss_tokens
+            + response_2.usage.prompt_cache_hit_tokens
         )
         assert (
             response_2.usage._cache_read_input_tokens
diff --git a/tests/local_testing/test_custom_callback_input.py b/tests/local_testing/test_custom_callback_input.py
index efd540e11..b4d716054 100644
--- a/tests/local_testing/test_custom_callback_input.py
+++ b/tests/local_testing/test_custom_callback_input.py
@@ -1258,6 +1258,7 @@ def test_standard_logging_payload(model, turn_off_message_logging):
            "standard_logging_object"
        ]
        if turn_off_message_logging:
+            print("checks redacted-by-litellm")
            assert "redacted-by-litellm" == slobject["messages"][0]["content"]
            assert "redacted-by-litellm" == slobject["response"]

@@ -1307,9 +1308,15 @@ def test_aaastandard_logging_payload_cache_hit():
     assert standard_logging_object["saved_cache_cost"] > 0


-def test_logging_async_cache_hit_sync_call():
+@pytest.mark.parametrize(
+    "turn_off_message_logging",
+    [False, True],
+)
+def test_logging_async_cache_hit_sync_call(turn_off_message_logging):
     from litellm.types.utils import StandardLoggingPayload

+    litellm.turn_off_message_logging = turn_off_message_logging
+
     litellm.cache = Cache()

     response = litellm.completion(
@@ -1356,6 +1363,14 @@
     assert standard_logging_object["response_cost"] == 0
     assert standard_logging_object["saved_cache_cost"] > 0

+    if turn_off_message_logging:
+        print("checks redacted-by-litellm")
+        assert (
+            "redacted-by-litellm"
+            == standard_logging_object["messages"][0]["content"]
+        )
+        assert "redacted-by-litellm" == standard_logging_object["response"]
+

 def test_logging_standard_payload_failure_call():
     from litellm.types.utils import StandardLoggingPayload
diff --git a/tests/local_testing/test_prompt_caching.py b/tests/local_testing/test_prompt_caching.py
new file mode 100644
index 000000000..d714c58a7
--- /dev/null
+++ b/tests/local_testing/test_prompt_caching.py
@@ -0,0 +1,81 @@
+"""Asserts that prompt caching information is correctly returned for Anthropic, OpenAI, and Deepseek"""
+
+import io
+import os
+import sys
+
+sys.path.insert(0, os.path.abspath("../.."))
+
+import litellm
+import pytest
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "anthropic/claude-3-5-sonnet-20240620",
+        "openai/gpt-4o",
+        "deepseek/deepseek-chat",
+    ],
+)
+def test_prompt_caching_model(model):
+    for _ in range(2):
+        response = litellm.completion(
+            model=model,
+            messages=[
+                # System message
+                {
+                    "role": "system",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "Here is the full text of a complex legal agreement"
+                            * 400,
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ],
+                },
+                # This user turn is marked with the cache_control parameter so the checkpoint can read from the previous cache.
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "What are the key terms and conditions in this agreement?",
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo",
+                },
+                # The final turn is also marked with cache_control so follow-up requests can continue from this cache.
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "What are the key terms and conditions in this agreement?",
+                            "cache_control": {"type": "ephemeral"},
+                        }
+                    ],
+                },
+            ],
+            temperature=0.2,
+            max_tokens=10,
+        )
+
+        print("response=", response)
+        print("response.usage=", response.usage)
+
+        assert "prompt_tokens_details" in response.usage
+        assert response.usage.prompt_tokens_details.cached_tokens > 0
+
+        # assert "cache_read_input_tokens" in response.usage
+        # assert "cache_creation_input_tokens" in response.usage
+
+        # # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl
+        # assert (response.usage.cache_read_input_tokens > 0) or (
+        #     response.usage.cache_creation_input_tokens > 0
+        # )
diff --git a/tests/local_testing/test_sagemaker.py b/tests/local_testing/test_sagemaker.py
index fcf5dd71e..0185c7146 100644
--- a/tests/local_testing/test_sagemaker.py
+++ b/tests/local_testing/test_sagemaker.py
@@ -180,6 +180,40 @@ async def test_completion_sagemaker_stream(sync_mode, model):
         pytest.fail(f"Error occurred: {e}")


+@pytest.mark.asyncio()
+@pytest.mark.parametrize("sync_mode", [False, True])
+@pytest.mark.parametrize(
+    "model",
+    [
+        "sagemaker_chat/huggingface-pytorch-tgi-inference-2024-08-23-15-48-59-245",
+        "sagemaker/jumpstart-dft-hf-textgeneration1-mp-20240815-185614",
+    ],
+)
+async def test_completion_sagemaker_streaming_bad_request(sync_mode, model):
+    litellm.set_verbose = True
+    print("testing sagemaker")
+    if sync_mode is True:
+        with pytest.raises(litellm.BadRequestError):
+            response = litellm.completion(
+                model=model,
+                messages=[
+                    {"role": "user", "content": "hi"},
+                ],
+                stream=True,
+                max_tokens=8000000000000000,
+            )
+    else:
+        with pytest.raises(litellm.BadRequestError):
+            response = await litellm.acompletion(
+                model=model,
+                messages=[
+                    {"role": "user", "content": "hi"},
+                ],
+                stream=True,
+                max_tokens=8000000000000000,
+            )
+
+
 @pytest.mark.asyncio
 async def test_acompletion_sagemaker_non_stream():
     mock_response = AsyncMock()
diff --git a/tests/local_testing/test_utils.py b/tests/local_testing/test_utils.py
index 802d63993..a5afcc33e 100644
--- a/tests/local_testing/test_utils.py
+++ b/tests/local_testing/test_utils.py
@@ -517,17 +517,19 @@ def test_redact_msgs_from_logs():
         ]
     )

+    litellm_logging_obj = Logging(
+        model="gpt-3.5-turbo",
+        messages=[{"role": "user", "content": "hi"}],
+        stream=False,
+        call_type="acompletion",
+        litellm_call_id="1234",
+        start_time=datetime.now(),
+        function_id="1234",
+    )
+
     _redacted_response_obj = redact_message_input_output_from_logging(
         result=response_obj,
-        litellm_logging_obj=Logging(
-            model="gpt-3.5-turbo",
-            messages=[{"role": "user", "content": "hi"}],
-            stream=False,
-            call_type="acompletion",
-            litellm_call_id="1234",
-            start_time=datetime.now(),
-            function_id="1234",
-        ),
+        model_call_details=litellm_logging_obj.model_call_details,
     )

     # Assert the response_obj content is NOT modified
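As a usage note for the prompt-caching test above: this change normalizes Anthropic and Deepseek cache statistics into the OpenAI-style usage shape, so callers can read prompt_tokens_details.cached_tokens regardless of provider. A minimal sketch, assuming provider API keys are set in the environment and reusing the model id from the test:

# Sketch only (not part of the patch): reads the unified prompt-caching fields
# asserted in test_prompt_caching.py above. Assumes ANTHROPIC_API_KEY is set;
# cached_tokens is only > 0 once a matching cache entry already exists.
import litellm

response = litellm.completion(
    model="anthropic/claude-3-5-sonnet-20240620",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Here is the full text of a complex legal agreement" * 400,
                    "cache_control": {"type": "ephemeral"},
                }
            ],
        }
    ],
    max_tokens=10,
)
# OpenAI-format cached-token info, populated for Anthropic/Deepseek as well
print(response.usage.prompt_tokens_details)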
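The test_utils.py hunk above also documents the new call signature: redact_message_input_output_from_logging now receives the logging object's model_call_details dict instead of the Logging object itself. A minimal sketch of the updated call; the import locations are assumed from the litellm_core_utils modules this patch modifies and are not verified here.

# Sketch only (not part of the patch): mirrors the updated call in
# test_redact_msgs_from_logs. Import paths and the bare ModelResponse() are
# assumptions for illustration.
from datetime import datetime

from litellm import ModelResponse
from litellm.litellm_core_utils.litellm_logging import Logging
from litellm.litellm_core_utils.redact_messages import (
    redact_message_input_output_from_logging,
)

litellm_logging_obj = Logging(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    stream=False,
    call_type="acompletion",
    litellm_call_id="1234",
    start_time=datetime.now(),
    function_id="1234",
)

response_obj = ModelResponse()  # stand-in for the response built earlier in the test
redacted = redact_message_input_output_from_logging(
    result=response_obj,
    model_call_details=litellm_logging_obj.model_call_details,
)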