diff --git a/litellm/llms/bedrock_httpx.py b/litellm/llms/bedrock_httpx.py
index d5088f3842..5d22c5ecbc 100644
--- a/litellm/llms/bedrock_httpx.py
+++ b/litellm/llms/bedrock_httpx.py
@@ -307,7 +307,13 @@ class BedrockLLM(BaseLLM):
         try:
             if provider == "cohere":
-                outputText = completion_response["text"]  # type: ignore
+                if "text" in completion_response:
+                    outputText = completion_response["text"]  # type: ignore
+                elif "generations" in completion_response:
+                    outputText = completion_response["generations"][0]["text"]
+                    model_response["finish_reason"] = map_finish_reason(
+                        completion_response["generations"][0]["finish_reason"]
+                    )
             elif provider == "anthropic":
                 if model.startswith("anthropic.claude-3"):
                     json_schemas: dict = {}
diff --git a/litellm/main.py b/litellm/main.py
index 764ee5bb8a..14fd5439ff 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1981,21 +1981,60 @@ def completion(
             # boto3 reads keys from .env
             custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
-            response = bedrock_chat_completion.completion(
-                model=model,
-                messages=messages,
-                custom_prompt_dict=litellm.custom_prompt_dict,
-                model_response=model_response,
-                print_verbose=print_verbose,
-                optional_params=optional_params,
-                litellm_params=litellm_params,
-                logger_fn=logger_fn,
-                encoding=encoding,
-                logging_obj=logging,
-                extra_headers=extra_headers,
-                timeout=timeout,
-                acompletion=acompletion,
-            )
+            if (
+                "aws_bedrock_client" in optional_params
+            ):  # use old bedrock flow for aws_bedrock_client users.
+                response = bedrock.completion(
+                    model=model,
+                    messages=messages,
+                    custom_prompt_dict=litellm.custom_prompt_dict,
+                    model_response=model_response,
+                    print_verbose=print_verbose,
+                    optional_params=optional_params,
+                    litellm_params=litellm_params,
+                    logger_fn=logger_fn,
+                    encoding=encoding,
+                    logging_obj=logging,
+                    extra_headers=extra_headers,
+                    timeout=timeout,
+                )
+
+                if (
+                    "stream" in optional_params
+                    and optional_params["stream"] == True
+                    and not isinstance(response, CustomStreamWrapper)
+                ):
+                    # don't try to access stream object,
+                    if "ai21" in model:
+                        response = CustomStreamWrapper(
+                            response,
+                            model,
+                            custom_llm_provider="bedrock",
+                            logging_obj=logging,
+                        )
+                    else:
+                        response = CustomStreamWrapper(
+                            iter(response),
+                            model,
+                            custom_llm_provider="bedrock",
+                            logging_obj=logging,
+                        )
+            else:
+                response = bedrock_chat_completion.completion(
+                    model=model,
+                    messages=messages,
+                    custom_prompt_dict=custom_prompt_dict,
+                    model_response=model_response,
+                    print_verbose=print_verbose,
+                    optional_params=optional_params,
+                    litellm_params=litellm_params,
+                    logger_fn=logger_fn,
+                    encoding=encoding,
+                    logging_obj=logging,
+                    extra_headers=extra_headers,
+                    timeout=timeout,
+                    acompletion=acompletion,
+                )
             if optional_params.get("stream", False):
                 ## LOGGING
                 logging.post_call(
diff --git a/litellm/tests/log.txt b/litellm/tests/log.txt
index b3c9d4a09d..c82f142963 100644
--- a/litellm/tests/log.txt
+++ b/litellm/tests/log.txt
@@ -1,4067 +1,43 @@
 ============================= test session starts ==============================
-platform darwin -- Python 3.11.9, pytest-7.3.1, pluggy-1.3.0 -- /opt/homebrew/opt/python@3.11/bin/python3.11
-cachedir: .pytest_cache
+platform darwin -- Python 3.11.9, pytest-7.3.1, pluggy-1.3.0
 rootdir: /Users/krrishdholakia/Documents/litellm/litellm/tests
 plugins: timeout-2.2.0, asyncio-0.23.2, anyio-3.7.1, xdist-3.3.1
 asyncio: mode=Mode.STRICT
-collecting ... 
collected 2 items +collected 2 items -test_streaming.py::test_bedrock_httpx_streaming[bedrock/amazon.titan-tg1-large-False] FAILED [ 50%] - -=================================== FAILURES =================================== -______ test_bedrock_httpx_streaming[bedrock/amazon.titan-tg1-large-False] ______ - -self = -chunk = {'finish_reason': '', 'is_finished': False, 'text': '\nHello, I am an AI model developed by Amazon Titan Foundation Mo...able of understanding and generating human-like text. My development has been focused on continuously improving my pe'} - - def chunk_creator(self, chunk): - model_response = self.model_response_creator() - response_obj = {} - try: - # return this for all models - completion_obj = {"content": ""} - if self.custom_llm_provider and self.custom_llm_provider == "anthropic": - response_obj = self.handle_anthropic_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif ( - self.custom_llm_provider - and self.custom_llm_provider == "anthropic_text" - ): - response_obj = self.handle_anthropic_text_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "clarifai": - response_obj = self.handle_clarifai_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - elif self.model == "replicate" or self.custom_llm_provider == "replicate": - response_obj = self.handle_replicate_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "together_ai": - response_obj = self.handle_together_ai_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": - response_obj = self.handle_huggingface_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "predibase": - response_obj = self.handle_predibase_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif ( - self.custom_llm_provider and self.custom_llm_provider == "baseten" - ): # baseten doesn't provide streaming - completion_obj["content"] = self.handle_baseten_chunk(chunk) - elif ( - self.custom_llm_provider and self.custom_llm_provider == "ai21" - ): # ai21 doesn't provide streaming - response_obj = self.handle_ai21_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": - response_obj = self.handle_maritalk_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "vllm": - completion_obj["content"] = chunk[0].outputs[0].text - elif ( - self.custom_llm_provider and 
self.custom_llm_provider == "aleph_alpha" - ): # aleph alpha doesn't provide streaming - response_obj = self.handle_aleph_alpha_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "nlp_cloud": - try: - response_obj = self.handle_nlp_cloud_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - except Exception as e: - if self.received_finish_reason: - raise e - else: - if self.sent_first_chunk is False: - raise Exception("An unknown error occurred with the stream") - self.received_finish_reason = "stop" - elif self.custom_llm_provider == "gemini": - if hasattr(chunk, "parts") == True: - try: - if len(chunk.parts) > 0: - completion_obj["content"] = chunk.parts[0].text - if len(chunk.parts) > 0 and hasattr( - chunk.parts[0], "finish_reason" - ): - self.received_finish_reason = chunk.parts[ - 0 - ].finish_reason.name - except: - if chunk.parts[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. {str(chunk)}" - ) - else: - completion_obj["content"] = str(chunk) - elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"): - import proto # type: ignore - - if self.model.startswith("claude-3"): - response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - setattr(model_response, "usage", Usage()) - if response_obj.get("prompt_tokens", None) is not None: - model_response.usage.prompt_tokens = response_obj[ - "prompt_tokens" - ] - if response_obj.get("completion_tokens", None) is not None: - model_response.usage.completion_tokens = response_obj[ - "completion_tokens" - ] - if hasattr(model_response.usage, "prompt_tokens"): - model_response.usage.total_tokens = ( - getattr(model_response.usage, "total_tokens", 0) - + model_response.usage.prompt_tokens - ) - if hasattr(model_response.usage, "completion_tokens"): - model_response.usage.total_tokens = ( - getattr(model_response.usage, "total_tokens", 0) - + model_response.usage.completion_tokens - ) - - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif hasattr(chunk, "candidates") == True: - try: - try: - completion_obj["content"] = chunk.text - except Exception as e: - if "Part has no text." 
in str(e): - ## check for function calling - function_call = ( - chunk.candidates[0].content.parts[0].function_call - ) - - args_dict = {} - - # Check if it's a RepeatedComposite instance - for key, val in function_call.args.items(): - if isinstance( - val, - proto.marshal.collections.repeated.RepeatedComposite, - ): - # If so, convert to list - args_dict[key] = [v for v in val] - else: - args_dict[key] = val - - try: - args_str = json.dumps(args_dict) - except Exception as e: - raise e - _delta_obj = litellm.utils.Delta( - content=None, - tool_calls=[ - { - "id": f"call_{str(uuid.uuid4())}", - "function": { - "arguments": args_str, - "name": function_call.name, - }, - "type": "function", - } - ], - ) - _streaming_response = StreamingChoices(delta=_delta_obj) - _model_response = ModelResponse(stream=True) - _model_response.choices = [_streaming_response] - response_obj = {"original_chunk": _model_response} - else: - raise e - if ( - hasattr(chunk.candidates[0], "finish_reason") - and chunk.candidates[0].finish_reason.name - != "FINISH_REASON_UNSPECIFIED" - ): # every non-final chunk in vertex ai has this - self.received_finish_reason = chunk.candidates[ - 0 - ].finish_reason.name - except Exception as e: - if chunk.candidates[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. {str(chunk)}" - ) - else: - completion_obj["content"] = str(chunk) - elif self.custom_llm_provider == "cohere": - response_obj = self.handle_cohere_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "cohere_chat": - response_obj = self.handle_cohere_chat_chunk(chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "bedrock": - if self.received_finish_reason is not None: - raise StopIteration -> response_obj = self.handle_bedrock_stream(chunk) - -../utils.py:11034: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = , chunk = None - - def handle_bedrock_stream(self, chunk): - if "cohere" in self.model or "anthropic" in self.model: - return { - "text": chunk["text"], - "is_finished": chunk["is_finished"], - "finish_reason": chunk["finish_reason"], - } - if hasattr(chunk, "get"): - chunk = chunk.get("chunk") -> chunk_data = json.loads(chunk.get("bytes").decode()) -E AttributeError: 'NoneType' object has no attribute 'get' - -../utils.py:10648: AttributeError - -During handling of the above exception, another exception occurred: - -sync_mode = False, model = 'bedrock/amazon.titan-tg1-large' - - @pytest.mark.parametrize("sync_mode", [True, False]) - @pytest.mark.parametrize( - "model", - [ - # "bedrock/cohere.command-r-plus-v1:0", - # "anthropic.claude-3-sonnet-20240229-v1:0", - # "anthropic.claude-instant-v1", - # "bedrock/ai21.j2-mid", - # "mistral.mistral-7b-instruct-v0:2", - "bedrock/amazon.titan-tg1-large", - # "meta.llama3-8b-instruct-v1:0", - ], - ) - @pytest.mark.asyncio - async def test_bedrock_httpx_streaming(sync_mode, model): - try: - litellm.set_verbose = True - if sync_mode: - final_chunk: Optional[litellm.ModelResponse] = None - response: litellm.CustomStreamWrapper = completion( # type: ignore - model=model, - messages=messages, - max_tokens=10, # type: ignore - stream=True, - ) - complete_response = "" - # Add any 
assertions here to check the response - has_finish_reason = False - for idx, chunk in enumerate(response): - final_chunk = chunk - chunk, finished = streaming_format_tests(idx, chunk) - if finished: - has_finish_reason = True - break - complete_response += chunk - if has_finish_reason == False: - raise Exception("finish reason not set") - if complete_response.strip() == "": - raise Exception("Empty response received") - else: - response: litellm.CustomStreamWrapper = await litellm.acompletion( # type: ignore - model=model, - messages=messages, - max_tokens=100, # type: ignore - stream=True, - ) - complete_response = "" - # Add any assertions here to check the response - has_finish_reason = False - idx = 0 - final_chunk: Optional[litellm.ModelResponse] = None -> async for chunk in response: - -test_streaming.py:1094: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = - - async def __anext__(self): - try: - if ( - self.custom_llm_provider == "openai" - or self.custom_llm_provider == "azure" - or self.custom_llm_provider == "custom_openai" - or self.custom_llm_provider == "text-completion-openai" - or self.custom_llm_provider == "azure_text" - or self.custom_llm_provider == "anthropic" - or self.custom_llm_provider == "anthropic_text" - or self.custom_llm_provider == "huggingface" - or self.custom_llm_provider == "ollama" - or self.custom_llm_provider == "ollama_chat" - or self.custom_llm_provider == "vertex_ai" - or self.custom_llm_provider == "sagemaker" - or self.custom_llm_provider == "gemini" - or self.custom_llm_provider == "replicate" - or self.custom_llm_provider == "cached_response" - or self.custom_llm_provider == "predibase" - or self.custom_llm_provider == "bedrock" - or self.custom_llm_provider in litellm.openai_compatible_endpoints - ): - async for chunk in self.completion_stream: - print_verbose(f"value of async chunk: {chunk}") - if chunk == "None" or chunk is None: - raise Exception - elif ( - self.custom_llm_provider == "gemini" - and hasattr(chunk, "parts") - and len(chunk.parts) == 0 - ): - continue - # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. 
- # __anext__ also calls async_success_handler, which does logging - print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}") - - processed_chunk: Optional[ModelResponse] = self.chunk_creator( - chunk=chunk - ) - print_verbose( - f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}" - ) - if processed_chunk is None: - continue - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, args=(processed_chunk,) - ).start() # log response - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, - ) - ) - self.response_uptil_now += ( - processed_chunk.choices[0].delta.get("content", "") or "" - ) - self.rules.post_call_rules( - input=self.response_uptil_now, model=self.model - ) - print_verbose(f"final returned processed chunk: {processed_chunk}") - return processed_chunk - raise StopAsyncIteration - else: # temporary patch for non-aiohttp async calls - # example - boto3 bedrock llms - while True: - if isinstance(self.completion_stream, str) or isinstance( - self.completion_stream, bytes - ): - chunk = self.completion_stream - else: - chunk = next(self.completion_stream) - if chunk is not None and chunk != b"": - print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}") - processed_chunk: Optional[ModelResponse] = self.chunk_creator( - chunk=chunk - ) - print_verbose( - f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}" - ) - if processed_chunk is None: - continue - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, - args=(processed_chunk,), - ).start() # log processed_chunk - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, - ) - ) - - self.response_uptil_now += ( - processed_chunk.choices[0].delta.get("content", "") or "" - ) - self.rules.post_call_rules( - input=self.response_uptil_now, model=self.model - ) - # RETURN RESULT - return processed_chunk - except StopAsyncIteration: - if self.sent_last_chunk == True: - raise # Re-raise StopIteration - else: - self.sent_last_chunk = True - processed_chunk = self.finish_reason_handler() - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, args=(processed_chunk,) - ).start() # log response - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, - ) - ) - return processed_chunk - except StopIteration: - if self.sent_last_chunk == True: - raise StopAsyncIteration - else: - self.sent_last_chunk = True - processed_chunk = self.finish_reason_handler() - ## LOGGING - threading.Thread( - target=self.logging_obj.success_handler, args=(processed_chunk,) - ).start() # log response - asyncio.create_task( - self.logging_obj.async_success_handler( - processed_chunk, - ) - ) - return processed_chunk - except Exception as e: - traceback_exception = traceback.format_exc() - # Handle any exceptions that might occur during streaming - asyncio.create_task( - self.logging_obj.async_failure_handler(e, traceback_exception) - ) -> raise e - -../utils.py:11630: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = - - async def __anext__(self): - try: - if ( - self.custom_llm_provider == "openai" - or self.custom_llm_provider == "azure" - or self.custom_llm_provider == "custom_openai" - or self.custom_llm_provider == "text-completion-openai" - or self.custom_llm_provider == "azure_text" - or self.custom_llm_provider == "anthropic" - or self.custom_llm_provider == "anthropic_text" - or self.custom_llm_provider == "huggingface" - or self.custom_llm_provider == 
"ollama" - or self.custom_llm_provider == "ollama_chat" - or self.custom_llm_provider == "vertex_ai" - or self.custom_llm_provider == "sagemaker" - or self.custom_llm_provider == "gemini" - or self.custom_llm_provider == "replicate" - or self.custom_llm_provider == "cached_response" - or self.custom_llm_provider == "predibase" - or self.custom_llm_provider == "bedrock" - or self.custom_llm_provider in litellm.openai_compatible_endpoints - ): - async for chunk in self.completion_stream: - print_verbose(f"value of async chunk: {chunk}") - if chunk == "None" or chunk is None: - raise Exception - elif ( - self.custom_llm_provider == "gemini" - and hasattr(chunk, "parts") - and len(chunk.parts) == 0 - ): - continue - # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks. - # __anext__ also calls async_success_handler, which does logging - print_verbose(f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}") - -> processed_chunk: Optional[ModelResponse] = self.chunk_creator( - chunk=chunk - ) - -../utils.py:11528: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -self = -chunk = {'finish_reason': '', 'is_finished': False, 'text': '\nHello, I am an AI model developed by Amazon Titan Foundation Mo...able of understanding and generating human-like text. My development has been focused on continuously improving my pe'} - - def chunk_creator(self, chunk): - model_response = self.model_response_creator() - response_obj = {} - try: - # return this for all models - completion_obj = {"content": ""} - if self.custom_llm_provider and self.custom_llm_provider == "anthropic": - response_obj = self.handle_anthropic_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif ( - self.custom_llm_provider - and self.custom_llm_provider == "anthropic_text" - ): - response_obj = self.handle_anthropic_text_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "clarifai": - response_obj = self.handle_clarifai_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - elif self.model == "replicate" or self.custom_llm_provider == "replicate": - response_obj = self.handle_replicate_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "together_ai": - response_obj = self.handle_together_ai_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "huggingface": - response_obj = self.handle_huggingface_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "predibase": - response_obj = self.handle_predibase_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif ( - self.custom_llm_provider and 
self.custom_llm_provider == "baseten" - ): # baseten doesn't provide streaming - completion_obj["content"] = self.handle_baseten_chunk(chunk) - elif ( - self.custom_llm_provider and self.custom_llm_provider == "ai21" - ): # ai21 doesn't provide streaming - response_obj = self.handle_ai21_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "maritalk": - response_obj = self.handle_maritalk_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider and self.custom_llm_provider == "vllm": - completion_obj["content"] = chunk[0].outputs[0].text - elif ( - self.custom_llm_provider and self.custom_llm_provider == "aleph_alpha" - ): # aleph alpha doesn't provide streaming - response_obj = self.handle_aleph_alpha_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "nlp_cloud": - try: - response_obj = self.handle_nlp_cloud_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - except Exception as e: - if self.received_finish_reason: - raise e - else: - if self.sent_first_chunk is False: - raise Exception("An unknown error occurred with the stream") - self.received_finish_reason = "stop" - elif self.custom_llm_provider == "gemini": - if hasattr(chunk, "parts") == True: - try: - if len(chunk.parts) > 0: - completion_obj["content"] = chunk.parts[0].text - if len(chunk.parts) > 0 and hasattr( - chunk.parts[0], "finish_reason" - ): - self.received_finish_reason = chunk.parts[ - 0 - ].finish_reason.name - except: - if chunk.parts[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. {str(chunk)}" - ) - else: - completion_obj["content"] = str(chunk) - elif self.custom_llm_provider and (self.custom_llm_provider == "vertex_ai"): - import proto # type: ignore - - if self.model.startswith("claude-3"): - response_obj = self.handle_vertexai_anthropic_chunk(chunk=chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - setattr(model_response, "usage", Usage()) - if response_obj.get("prompt_tokens", None) is not None: - model_response.usage.prompt_tokens = response_obj[ - "prompt_tokens" - ] - if response_obj.get("completion_tokens", None) is not None: - model_response.usage.completion_tokens = response_obj[ - "completion_tokens" - ] - if hasattr(model_response.usage, "prompt_tokens"): - model_response.usage.total_tokens = ( - getattr(model_response.usage, "total_tokens", 0) - + model_response.usage.prompt_tokens - ) - if hasattr(model_response.usage, "completion_tokens"): - model_response.usage.total_tokens = ( - getattr(model_response.usage, "total_tokens", 0) - + model_response.usage.completion_tokens - ) - - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif hasattr(chunk, "candidates") == True: - try: - try: - completion_obj["content"] = chunk.text - except Exception as e: - if "Part has no text." 
in str(e): - ## check for function calling - function_call = ( - chunk.candidates[0].content.parts[0].function_call - ) - - args_dict = {} - - # Check if it's a RepeatedComposite instance - for key, val in function_call.args.items(): - if isinstance( - val, - proto.marshal.collections.repeated.RepeatedComposite, - ): - # If so, convert to list - args_dict[key] = [v for v in val] - else: - args_dict[key] = val - - try: - args_str = json.dumps(args_dict) - except Exception as e: - raise e - _delta_obj = litellm.utils.Delta( - content=None, - tool_calls=[ - { - "id": f"call_{str(uuid.uuid4())}", - "function": { - "arguments": args_str, - "name": function_call.name, - }, - "type": "function", - } - ], - ) - _streaming_response = StreamingChoices(delta=_delta_obj) - _model_response = ModelResponse(stream=True) - _model_response.choices = [_streaming_response] - response_obj = {"original_chunk": _model_response} - else: - raise e - if ( - hasattr(chunk.candidates[0], "finish_reason") - and chunk.candidates[0].finish_reason.name - != "FINISH_REASON_UNSPECIFIED" - ): # every non-final chunk in vertex ai has this - self.received_finish_reason = chunk.candidates[ - 0 - ].finish_reason.name - except Exception as e: - if chunk.candidates[0].finish_reason.name == "SAFETY": - raise Exception( - f"The response was blocked by VertexAI. {str(chunk)}" - ) - else: - completion_obj["content"] = str(chunk) - elif self.custom_llm_provider == "cohere": - response_obj = self.handle_cohere_chunk(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "cohere_chat": - response_obj = self.handle_cohere_chat_chunk(chunk) - if response_obj is None: - return - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "bedrock": - if self.received_finish_reason is not None: - raise StopIteration - response_obj = self.handle_bedrock_stream(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "sagemaker": - print_verbose(f"ENTERS SAGEMAKER STREAMING for chunk {chunk}") - response_obj = self.handle_sagemaker_stream(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "petals": - if len(self.completion_stream) == 0: - if self.received_finish_reason is not None: - raise StopIteration - else: - self.received_finish_reason = "stop" - chunk_size = 30 - new_chunk = self.completion_stream[:chunk_size] - completion_obj["content"] = new_chunk - self.completion_stream = self.completion_stream[chunk_size:] - time.sleep(0.05) - elif self.custom_llm_provider == "palm": - # fake streaming - response_obj = {} - if len(self.completion_stream) == 0: - if self.received_finish_reason is not None: - raise StopIteration - else: - self.received_finish_reason = "stop" - chunk_size = 30 - new_chunk = self.completion_stream[:chunk_size] - completion_obj["content"] = new_chunk - self.completion_stream = self.completion_stream[chunk_size:] - time.sleep(0.05) - elif self.custom_llm_provider == "ollama": - response_obj = self.handle_ollama_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion 
obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "ollama_chat": - response_obj = self.handle_ollama_chat_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "cloudflare": - response_obj = self.handle_cloudlfare_stream(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "watsonx": - response_obj = self.handle_watsonx_stream(chunk) - completion_obj["content"] = response_obj["text"] - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "text-completion-openai": - response_obj = self.handle_openai_text_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - if ( - self.stream_options - and self.stream_options.get("include_usage", False) == True - ): - model_response.usage = response_obj["usage"] - elif self.custom_llm_provider == "azure_text": - response_obj = self.handle_azure_text_completion_chunk(chunk) - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - elif self.custom_llm_provider == "cached_response": - response_obj = { - "text": chunk.choices[0].delta.content, - "is_finished": True, - "finish_reason": chunk.choices[0].finish_reason, - "original_chunk": chunk, - } - - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if hasattr(chunk, "id"): - model_response.id = chunk.id - self.response_id = chunk.id - if hasattr(chunk, "system_fingerprint"): - self.system_fingerprint = chunk.system_fingerprint - if response_obj["is_finished"]: - self.received_finish_reason = response_obj["finish_reason"] - else: # openai / azure chat model - if self.custom_llm_provider == "azure": - if hasattr(chunk, "model"): - # for azure, we need to pass the model from the orignal chunk - self.model = chunk.model - response_obj = self.handle_openai_chat_completion_chunk(chunk) - if response_obj == None: - return - completion_obj["content"] = response_obj["text"] - print_verbose(f"completion obj content: {completion_obj['content']}") - if response_obj["is_finished"]: - if response_obj["finish_reason"] == "error": - raise Exception( - "Mistral API raised a streaming error - finish_reason: error, no content string given." 
- ) - self.received_finish_reason = response_obj["finish_reason"] - if response_obj.get("original_chunk", None) is not None: - if hasattr(response_obj["original_chunk"], "id"): - model_response.id = response_obj["original_chunk"].id - self.response_id = model_response.id - if hasattr(response_obj["original_chunk"], "system_fingerprint"): - model_response.system_fingerprint = response_obj[ - "original_chunk" - ].system_fingerprint - self.system_fingerprint = response_obj[ - "original_chunk" - ].system_fingerprint - if response_obj["logprobs"] is not None: - model_response.choices[0].logprobs = response_obj["logprobs"] - - if ( - self.stream_options is not None - and self.stream_options["include_usage"] == True - ): - model_response.usage = response_obj["usage"] - - model_response.model = self.model - print_verbose( - f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}" - ) - ## FUNCTION CALL PARSING - if ( - response_obj is not None - and response_obj.get("original_chunk", None) is not None - ): # function / tool calling branch - only set for openai/azure compatible endpoints - # enter this branch when no content has been passed in response - original_chunk = response_obj.get("original_chunk", None) - model_response.id = original_chunk.id - self.response_id = original_chunk.id - if len(original_chunk.choices) > 0: - if ( - original_chunk.choices[0].delta.function_call is not None - or original_chunk.choices[0].delta.tool_calls is not None - ): - try: - delta = original_chunk.choices[0].delta - model_response.system_fingerprint = ( - original_chunk.system_fingerprint - ) - ## AZURE - check if arguments is not None - if ( - original_chunk.choices[0].delta.function_call - is not None - ): - if ( - getattr( - original_chunk.choices[0].delta.function_call, - "arguments", - ) - is None - ): - original_chunk.choices[ - 0 - ].delta.function_call.arguments = "" - elif original_chunk.choices[0].delta.tool_calls is not None: - if isinstance( - original_chunk.choices[0].delta.tool_calls, list - ): - for t in original_chunk.choices[0].delta.tool_calls: - if hasattr(t, "functions") and hasattr( - t.functions, "arguments" - ): - if ( - getattr( - t.function, - "arguments", - ) - is None - ): - t.function.arguments = "" - _json_delta = delta.model_dump() - print_verbose(f"_json_delta: {_json_delta}") - if "role" not in _json_delta or _json_delta["role"] is None: - _json_delta["role"] = ( - "assistant" # mistral's api returns role as None - ) - if "tool_calls" in _json_delta and isinstance( - _json_delta["tool_calls"], list - ): - for tool in _json_delta["tool_calls"]: - if ( - isinstance(tool, dict) - and "function" in tool - and isinstance(tool["function"], dict) - and ("type" not in tool or tool["type"] is None) - ): - # if function returned but type set to None - mistral's api returns type: None - tool["type"] = "function" - model_response.choices[0].delta = Delta(**_json_delta) - except Exception as e: - traceback.print_exc() - model_response.choices[0].delta = Delta() - else: - try: - delta = dict(original_chunk.choices[0].delta) - print_verbose(f"original delta: {delta}") - model_response.choices[0].delta = Delta(**delta) - print_verbose( - f"new delta: {model_response.choices[0].delta}" - ) - except Exception as e: - model_response.choices[0].delta = Delta() - else: - if ( - self.stream_options is not None - and self.stream_options["include_usage"] == True - ): - return model_response - return - print_verbose( - f"model_response.choices[0].delta: 
{model_response.choices[0].delta}; completion_obj: {completion_obj}" - ) - print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") - - ## RETURN ARG - if ( - "content" in completion_obj - and isinstance(completion_obj["content"], str) - and len(completion_obj["content"]) == 0 - and hasattr(model_response, "usage") - and hasattr(model_response.usage, "prompt_tokens") - ): - if self.sent_first_chunk == False: - completion_obj["role"] = "assistant" - self.sent_first_chunk = True - model_response.choices[0].delta = Delta(**completion_obj) - print_verbose(f"returning model_response: {model_response}") - return model_response - elif ( - "content" in completion_obj - and isinstance(completion_obj["content"], str) - and len(completion_obj["content"]) > 0 - ): # cannot set content of an OpenAI Object to be an empty string - hold, model_response_str = self.check_special_tokens( - chunk=completion_obj["content"], - finish_reason=model_response.choices[0].finish_reason, - ) # filter out bos/eos tokens from openai-compatible hf endpoints - print_verbose( - f"hold - {hold}, model_response_str - {model_response_str}" - ) - if hold is False: - ## check if openai/azure chunk - original_chunk = response_obj.get("original_chunk", None) - if original_chunk: - model_response.id = original_chunk.id - self.response_id = original_chunk.id - if len(original_chunk.choices) > 0: - choices = [] - for idx, choice in enumerate(original_chunk.choices): - try: - if isinstance(choice, BaseModel): - try: - choice_json = choice.model_dump() - except Exception as e: - choice_json = choice.dict() - choice_json.pop( - "finish_reason", None - ) # for mistral etc. which return a value in their last chunk (not-openai compatible). - print_verbose(f"choice_json: {choice_json}") - choices.append(StreamingChoices(**choice_json)) - except Exception as e: - choices.append(StreamingChoices()) - print_verbose(f"choices in streaming: {choices}") - model_response.choices = choices - else: - return - model_response.system_fingerprint = ( - original_chunk.system_fingerprint - ) - print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}") - if self.sent_first_chunk == False: - model_response.choices[0].delta["role"] = "assistant" - self.sent_first_chunk = True - elif self.sent_first_chunk == True and hasattr( - model_response.choices[0].delta, "role" - ): - _initial_delta = model_response.choices[ - 0 - ].delta.model_dump() - _initial_delta.pop("role", None) - model_response.choices[0].delta = Delta(**_initial_delta) - print_verbose( - f"model_response.choices[0].delta: {model_response.choices[0].delta}" - ) - else: - ## else - completion_obj["content"] = model_response_str - if self.sent_first_chunk == False: - completion_obj["role"] = "assistant" - self.sent_first_chunk = True - model_response.choices[0].delta = Delta(**completion_obj) - print_verbose(f"returning model_response: {model_response}") - return model_response - else: - return - elif self.received_finish_reason is not None: - if self.sent_last_chunk == True: - raise StopIteration - # flush any remaining holding chunk - if len(self.holding_chunk) > 0: - if model_response.choices[0].delta.content is None: - model_response.choices[0].delta.content = self.holding_chunk - else: - model_response.choices[0].delta.content = ( - self.holding_chunk + model_response.choices[0].delta.content - ) - self.holding_chunk = "" - # if delta is None - _is_delta_empty = self.is_delta_empty( - delta=model_response.choices[0].delta - ) - - if _is_delta_empty: - # get any function call 
arguments - model_response.choices[0].finish_reason = map_finish_reason( - finish_reason=self.received_finish_reason - ) # ensure consistent output to openai - self.sent_last_chunk = True - - return model_response - elif ( - model_response.choices[0].delta.tool_calls is not None - or model_response.choices[0].delta.function_call is not None - ): - if self.sent_first_chunk == False: - model_response.choices[0].delta["role"] = "assistant" - self.sent_first_chunk = True - return model_response - else: - return - except StopIteration: - raise StopIteration - except Exception as e: - traceback_exception = traceback.format_exc() - e.message = str(e) -> raise exception_type( - model=self.model, - custom_llm_provider=self.custom_llm_provider, - original_exception=e, - ) - -../utils.py:11380: -_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -model = 'amazon.titan-tg1-large' -original_exception = AttributeError("'NoneType' object has no attribute 'get'") -custom_llm_provider = 'bedrock', completion_kwargs = {}, extra_kwargs = {} - - def exception_type( - model, - original_exception, - custom_llm_provider, - completion_kwargs={}, - extra_kwargs={}, - ): - global user_logger_fn, liteDebuggerClient - exception_mapping_worked = False - if litellm.suppress_debug_info is False: - print() # noqa - print( # noqa - "\033[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new\033[0m" # noqa - ) # noqa - print( # noqa - "LiteLLM.Info: If you need to debug this error, use `litellm.set_verbose=True'." # noqa - ) # noqa - print() # noqa - try: - if model: - error_str = str(original_exception) - if isinstance(original_exception, BaseException): - exception_type = type(original_exception).__name__ - else: - exception_type = "" - - ################################################################################ - # Common Extra information needed for all providers - # We pass num retries, api_base, vertex_deployment etc to the exception here - ################################################################################ - extra_information = "" - try: - _api_base = litellm.get_api_base( - model=model, optional_params=extra_kwargs - ) - messages = litellm.get_first_chars_messages(kwargs=completion_kwargs) - _vertex_project = extra_kwargs.get("vertex_project") - _vertex_location = extra_kwargs.get("vertex_location") - _metadata = extra_kwargs.get("metadata", {}) or {} - _model_group = _metadata.get("model_group") - _deployment = _metadata.get("deployment") - extra_information = f"\nModel: {model}" - if _api_base: - extra_information += f"\nAPI Base: {_api_base}" - if messages and len(messages) > 0: - extra_information += f"\nMessages: {messages}" - - if _model_group is not None: - extra_information += f"\nmodel_group: {_model_group}\n" - if _deployment is not None: - extra_information += f"\ndeployment: {_deployment}\n" - if _vertex_project is not None: - extra_information += f"\nvertex_project: {_vertex_project}\n" - if _vertex_location is not None: - extra_information += f"\nvertex_location: {_vertex_location}\n" - - # on litellm proxy add key name + team to exceptions - extra_information = _add_key_name_and_team_to_alert( - request_info=extra_information, metadata=_metadata - ) - except: - # DO NOT LET this Block raising the original exception - pass - - ################################################################################ - # End of Common Extra information Needed for all providers - 
################################################################################ - - ################################################################################ - #################### Start of Provider Exception mapping #################### - ################################################################################ - - if "Request Timeout Error" in error_str or "Request timed out" in error_str: - exception_mapping_worked = True - raise Timeout( - message=f"APITimeoutError - Request timed out. \nerror_str: {error_str}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - - if ( - custom_llm_provider == "openai" - or custom_llm_provider == "text-completion-openai" - or custom_llm_provider == "custom_openai" - or custom_llm_provider in litellm.openai_compatible_providers - ): - # custom_llm_provider is openai, make it OpenAI - if hasattr(original_exception, "message"): - message = original_exception.message - else: - message = str(original_exception) - if message is not None and isinstance(message, str): - message = message.replace("OPENAI", custom_llm_provider.upper()) - message = message.replace("openai", custom_llm_provider) - message = message.replace("OpenAI", custom_llm_provider) - if custom_llm_provider == "openai": - exception_provider = "OpenAI" + "Exception" - else: - exception_provider = ( - custom_llm_provider[0].upper() - + custom_llm_provider[1:] - + "Exception" - ) - - if "This model's maximum context length is" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "invalid_request_error" in error_str - and "model_not_found" in error_str - ): - exception_mapping_worked = True - raise NotFoundError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "invalid_request_error" in error_str - and "content_policy_violation" in error_str - ): - exception_mapping_worked = True - raise ContentPolicyViolationError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "invalid_request_error" in error_str - and "Incorrect API key provided" not in error_str - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif "Request too large" in error_str: - raise RateLimitError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif ( - "The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable" - in error_str - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif "Mistral API raised a streaming error" in error_str: - exception_mapping_worked = True - 
_request = httpx.Request( - method="POST", url="https://api.openai.com/v1" - ) - raise APIError( - status_code=500, - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - request=_request, - litellm_debug_info=extra_information, - ) - elif hasattr(original_exception, "status_code"): - exception_mapping_worked = True - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 404: - exception_mapping_worked = True - raise NotFoundError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - litellm_debug_info=extra_information, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"{exception_provider} - {message}", - model=model, - llm_provider=custom_llm_provider, - litellm_debug_info=extra_information, - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - request=original_exception.request, - litellm_debug_info=extra_information, - ) - else: - # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors - raise APIConnectionError( - message=f"{exception_provider} - {message}", - llm_provider=custom_llm_provider, - model=model, - litellm_debug_info=extra_information, - request=httpx.Request( - method="POST", url="https://api.openai.com/v1/" - ), - ) - elif custom_llm_provider == "anthropic": # one of the anthropics - if hasattr(original_exception, "message"): - if ( - "prompt is too long" in original_exception.message - or "prompt: length" in original_exception.message - ): - exception_mapping_worked = True - raise ContextWindowExceededError( - message=original_exception.message, - model=model, - llm_provider="anthropic", - response=original_exception.response, - ) - if "Invalid API Key" in original_exception.message: - exception_mapping_worked = True - raise AuthenticationError( - message=original_exception.message, - 
model=model, - llm_provider="anthropic", - response=original_exception.response, - ) - if hasattr(original_exception, "status_code"): - print_verbose(f"status_code: {original_exception.status_code}") - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AnthropicException - {original_exception.message}", - llm_provider="anthropic", - model=model, - response=original_exception.response, - ) - elif ( - original_exception.status_code == 400 - or original_exception.status_code == 413 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"AnthropicException - {original_exception.message}", - model=model, - llm_provider="anthropic", - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"AnthropicException - {original_exception.message}", - model=model, - llm_provider="anthropic", - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"AnthropicException - {original_exception.message}", - llm_provider="anthropic", - model=model, - response=original_exception.response, - ) - elif original_exception.status_code == 500: - exception_mapping_worked = True - raise APIError( - status_code=500, - message=f"AnthropicException - {original_exception.message}. Handle with `litellm.APIError`.", - llm_provider="anthropic", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "replicate": - if "Incorrect authentication token" in error_str: - exception_mapping_worked = True - raise AuthenticationError( - message=f"ReplicateException - {error_str}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif "input is too long" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"ReplicateException - {error_str}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif exception_type == "ModelError": - exception_mapping_worked = True - raise BadRequestError( - message=f"ReplicateException - {error_str}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif "Request was throttled" in error_str: - exception_mapping_worked = True - raise RateLimitError( - message=f"ReplicateException - {error_str}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif hasattr(original_exception, "status_code"): - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"ReplicateException - {original_exception.message}", - llm_provider="replicate", - model=model, - response=original_exception.response, - ) - elif ( - original_exception.status_code == 400 - or original_exception.status_code == 422 - or original_exception.status_code == 413 - ): - exception_mapping_worked = True - raise BadRequestError( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate", - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate", - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"ReplicateException - 
-        [... removed test-log output: the traceback's display of litellm.utils.exception_type,
-         with provider exception-mapping branches for replicate, watsonx, predibase, bedrock,
-         sagemaker, vertex_ai, palm/gemini, cloudflare, cohere, huggingface, ai21, nlp_cloud,
-         together_ai, aleph_alpha, ollama, vllm and azure, ending in the generic
-         APIConnectionError fallback ...]
-        except Exception as e:
-            # LOGGING
-            exception_logging(
-                logger_fn=user_logger_fn,
-                additional_args={
-                    "exception_mapping_worked": exception_mapping_worked,
-                    "original_exception": original_exception,
-                },
-                exception=e,
-            )
-            ## AUTH ERROR
-            if isinstance(e, AuthenticationError) and (
-                litellm.email or "LITELLM_EMAIL" in os.environ
-            ):
-                threading.Thread(target=get_all_keys, args=(e.llm_provider,)).start()
-            # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
-            if exception_mapping_worked:
->               raise e
-
-../utils.py:9661: 
-_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
-
-model = 'amazon.titan-tg1-large'
-original_exception = AttributeError("'NoneType' object has no attribute 'get'")
-custom_llm_provider = 'bedrock', completion_kwargs = {}, extra_kwargs = {}
-
-    def exception_type(
-        model,
-        original_exception,
-        custom_llm_provider,
-        completion_kwargs={},
-        extra_kwargs={},
-    ):
-        [... function preamble (debug banner prints, error_str / exception type extraction,
-         and extra_information assembly for the proxy) elided ...]
-        [... full provider exception-mapping source re-displayed by pytest for this frame
-         (request-timeout check, openai-compatible providers, anthropic, replicate, watsonx,
-         predibase, bedrock, sagemaker, vertex_ai, palm/gemini, cloudflare, cohere, huggingface,
-         ai21, nlp_cloud, together_ai, aleph_alpha); identical to the branches summarised above,
-         and continuing below ...]
- raise original_exception - raise original_exception - elif ( - custom_llm_provider == "ollama" or custom_llm_provider == "ollama_chat" - ): - if isinstance(original_exception, dict): - error_str = original_exception.get("error", "") - else: - error_str = str(original_exception) - if "no such file or directory" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}", - model=model, - llm_provider="ollama", - response=original_exception.response, - ) - elif "Failed to establish a new connection" in error_str: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"OllamaException: {original_exception}", - llm_provider="ollama", - model=model, - response=original_exception.response, - ) - elif "Invalid response object from API" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"OllamaException: {original_exception}", - llm_provider="ollama", - model=model, - response=original_exception.response, - ) - elif "Read timed out" in error_str: - exception_mapping_worked = True - raise Timeout( - message=f"OllamaException: {original_exception}", - llm_provider="ollama", - model=model, - ) - elif custom_llm_provider == "vllm": - if hasattr(original_exception, "status_code"): - if original_exception.status_code == 0: - exception_mapping_worked = True - raise APIConnectionError( - message=f"VLLMException - {original_exception.message}", - llm_provider="vllm", - model=model, - request=original_exception.request, - ) - elif custom_llm_provider == "azure": - if "Internal server error" in error_str: - exception_mapping_worked = True - raise APIError( - status_code=500, - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - request=httpx.Request(method="POST", url="https://openai.com/"), - ) - elif "This model's maximum context length is" in error_str: - exception_mapping_worked = True - raise ContextWindowExceededError( - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif "DeploymentNotFound" in error_str: - exception_mapping_worked = True - raise NotFoundError( - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif ( - "invalid_request_error" in error_str - and "content_policy_violation" in error_str - ) or ( - "The response was filtered due to the prompt triggering Azure OpenAI's content management" - in error_str - ): - exception_mapping_worked = True - raise ContentPolicyViolationError( - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif "invalid_request_error" in error_str: - exception_mapping_worked = True - raise BadRequestError( - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif ( - "The api_key client option must be set either by passing api_key to the client or by setting" - in error_str - ): - exception_mapping_worked = True - raise AuthenticationError( - message=f"{exception_provider} - 
{original_exception.message}", - llm_provider=custom_llm_provider, - model=model, - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif hasattr(original_exception, "status_code"): - exception_mapping_worked = True - if original_exception.status_code == 401: - exception_mapping_worked = True - raise AuthenticationError( - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif original_exception.status_code == 408: - exception_mapping_worked = True - raise Timeout( - message=f"AzureException - {original_exception.message}", - model=model, - litellm_debug_info=extra_information, - llm_provider="azure", - ) - if original_exception.status_code == 422: - exception_mapping_worked = True - raise BadRequestError( - message=f"AzureException - {original_exception.message}", - model=model, - llm_provider="azure", - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif original_exception.status_code == 429: - exception_mapping_worked = True - raise RateLimitError( - message=f"AzureException - {original_exception.message}", - model=model, - llm_provider="azure", - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif original_exception.status_code == 503: - exception_mapping_worked = True - raise ServiceUnavailableError( - message=f"AzureException - {original_exception.message}", - model=model, - llm_provider="azure", - litellm_debug_info=extra_information, - response=original_exception.response, - ) - elif original_exception.status_code == 504: # gateway timeout error - exception_mapping_worked = True - raise Timeout( - message=f"AzureException - {original_exception.message}", - model=model, - litellm_debug_info=extra_information, - llm_provider="azure", - ) - else: - exception_mapping_worked = True - raise APIError( - status_code=original_exception.status_code, - message=f"AzureException - {original_exception.message}", - llm_provider="azure", - litellm_debug_info=extra_information, - model=model, - request=httpx.Request( - method="POST", url="https://openai.com/" - ), - ) - else: - # if no status code then it is an APIConnectionError: https://github.com/openai/openai-python#handling-errors - raise APIConnectionError( - message=f"{exception_provider} - {message}", - llm_provider="azure", - model=model, - litellm_debug_info=extra_information, - request=httpx.Request(method="POST", url="https://openai.com/"), - ) - if ( - "BadRequestError.__init__() missing 1 required positional argument: 'param'" - in str(original_exception) - ): # deal with edge-case invalid request error bug in openai-python sdk - exception_mapping_worked = True - raise BadRequestError( - message=f"{exception_provider}: This can happen due to missing AZURE_API_VERSION: {str(original_exception)}", - model=model, - llm_provider=custom_llm_provider, - response=original_exception.response, - ) - else: # ensure generic errors always return APIConnectionError= - exception_mapping_worked = True - if hasattr(original_exception, "request"): - raise APIConnectionError( - message=f"{str(original_exception)}", - llm_provider=custom_llm_provider, - model=model, - request=original_exception.request, - ) - else: -> raise APIConnectionError( - message=f"{str(original_exception)}", - llm_provider=custom_llm_provider, - model=model, - request=httpx.Request( - method="POST", url="https://api.openai.com/v1/" - ), # 
-E       litellm.exceptions.APIConnectionError: 'NoneType' object has no attribute 'get'
-
-../utils.py:9636: APIConnectionError
-
-During handling of the above exception, another exception occurred:
-
-sync_mode = False, model = 'bedrock/amazon.titan-tg1-large'
-
-[... omitted: pytest's echo of the test_bedrock_httpx_streaming body (sync and async branches) from test_streaming.py ...]
-
->           pytest.fail(f"Error occurred: {e}")
-E           Failed: Error occurred: 'NoneType' object has no attribute 'get'
-
-test_streaming.py:1110: Failed
----------------------------- Captured stdout setup -----------------------------
------------------------------ Captured stdout call -----------------------------
+test_streaming.py .Logging Details LiteLLM-Async Success Call
+Goes into checking if chunk has hiddden created at param
+Chunks have a created at hidden param
+Chunks sorted
+token_counter messages received: [{'content': 'Hello, how are you?', 'role': 'user'}]
+Token Counter - using generic token counter, for model=cohere.command-text-v14
+LiteLLM: Utils - Counting tokens for OpenAI model=gpt-3.5-turbo
+.Token Counter - using generic token counter, for model=cohere.command-text-v14
+LiteLLM: Utils - Counting tokens for OpenAI model=gpt-3.5-turbo
+Async success callbacks: Got a complete streaming response
+Looking up model=cohere.command-text-v14 in model_cost_map
+Success: model=cohere.command-text-v14 in model_cost_map
+prompt_tokens=13; completion_tokens=10
+Returned custom cost for model=cohere.command-text-v14 - prompt_tokens_cost_usd_dollar: 1.95e-05, completion_tokens_cost_usd_dollar: 1.9999999999999998e-05
+final cost: 3.95e-05; prompt_tokens_cost_usd_dollar: 1.95e-05; completion_tokens_cost_usd_dollar: 1.9999999999999998e-05
+ [100%]Logging Details LiteLLM-Success Call: None
+success callbacks: []
+Goes into checking if chunk has hiddden created at param
+Chunks have a created at hidden param
+Chunks sorted
+token_counter messages received: [{'content': 'Hello, how are you?', 'role': 'user'}]
+Token Counter - using generic token counter, for model=cohere.command-text-v14
+LiteLLM: Utils - Counting tokens for OpenAI model=gpt-3.5-turbo
+Token Counter - using generic token counter, for model=cohere.command-text-v14
+LiteLLM: Utils - Counting tokens for OpenAI model=gpt-3.5-turbo
+Logging Details LiteLLM-Success Call streaming complete
+Looking up model=cohere.command-text-v14 in model_cost_map
+Success: model=cohere.command-text-v14 in model_cost_map
+prompt_tokens=13; completion_tokens=10
+Returned custom cost for model=cohere.command-text-v14 - prompt_tokens_cost_usd_dollar: 1.95e-05, completion_tokens_cost_usd_dollar: 1.9999999999999998e-05
+final cost: 3.95e-05; prompt_tokens_cost_usd_dollar: 1.95e-05; completion_tokens_cost_usd_dollar: 1.9999999999999998e-05
-
-[... omitted: the failing run's captured stdout (the original acompletion request, its optional params, the signed Bedrock invoke-with-response-stream curl command, and the streamed amazon.titan chunk dumps) ...]
-
 =============================== warnings summary ===============================
 ../../../../../../opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:284: 25 warnings
   /opt/homebrew/lib/python3.11/site-packages/pydantic/_internal/_config.py:284: PydanticDeprecatedSince20: Support for class-based `config` is deprecated, use ConfigDict instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
@@ -4123,12 +99,10 @@ Logging Details: logger_fn - None | callable(logger_fn) - False
 /Users/krrishdholakia/Documents/litellm/litellm/utils.py:60: DeprecationWarning: open_text is deprecated. Use files() instead. Refer to https://importlib-resources.readthedocs.io/en/latest/using.html#migrating-from-legacy for migration advice.
   with resources.open_text("litellm.llms.tokenizers", "anthropic_tokenizer.json") as f:
 
-test_streaming.py::test_bedrock_httpx_streaming[bedrock/amazon.titan-tg1-large-False]
+test_streaming.py::test_bedrock_httpx_streaming[cohere.command-text-v14-False]
+test_streaming.py::test_bedrock_httpx_streaming[cohere.command-text-v14-True]
   /opt/homebrew/lib/python3.11/site-packages/httpx/_content.py:204: DeprecationWarning: Use 'content=<...>' to upload raw bytes/text content.
     warnings.warn(message, DeprecationWarning)
 
 -- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
-=========================== short test summary info ============================
-FAILED test_streaming.py::test_bedrock_httpx_streaming[bedrock/amazon.titan-tg1-large-False]
-!!!!!!!!!!!!!!!!!!!!!!!!!! stopping after 1 failures !!!!!!!!!!!!!!!!!!!!!!!!!!!
-======================== 1 failed, 40 warnings in 3.56s ========================
+======================== 2 passed, 41 warnings in 4.94s ========================
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 68143f9ac5..57fb6d33ee 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -2673,6 +2673,7 @@ def response_format_tests(response: litellm.ModelResponse):
         "mistral.mistral-7b-instruct-v0:2",
         "bedrock/amazon.titan-tg1-large",
         "meta.llama3-8b-instruct-v1:0",
+        "cohere.command-text-v14",
     ],
 )
 @pytest.mark.asyncio
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 59f435a7ea..580adcba23 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -1044,13 +1044,14 @@ async def test_completion_replicate_llama3_streaming(sync_mode):
 @pytest.mark.parametrize(
     "model",
     [
-        "bedrock/cohere.command-r-plus-v1:0",
-        "anthropic.claude-3-sonnet-20240229-v1:0",
-        "anthropic.claude-instant-v1",
-        "bedrock/ai21.j2-mid",
-        "mistral.mistral-7b-instruct-v0:2",
-        "bedrock/amazon.titan-tg1-large",
-        "meta.llama3-8b-instruct-v1:0",
+        # "bedrock/cohere.command-r-plus-v1:0",
+        # "anthropic.claude-3-sonnet-20240229-v1:0",
+        # "anthropic.claude-instant-v1",
+        # "bedrock/ai21.j2-mid",
+        # "mistral.mistral-7b-instruct-v0:2",
+        # "bedrock/amazon.titan-tg1-large",
+        # "meta.llama3-8b-instruct-v1:0",
+        "cohere.command-text-v14"
     ],
 )
 @pytest.mark.asyncio
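
For reviewers who want to poke at the newly covered path by hand, here is a minimal sketch (not part of the diff) of the cohere-on-Bedrock streaming call that the re-parametrized test_bedrock_httpx_streaming exercises. It assumes AWS credentials and region are already configured in the environment (those variable names are assumptions, not values from this change); the prompt is the same one that appears in the new log above, and the explicit "bedrock/" prefix is used here while the tests pass the bare model id.

import litellm

# Sketch of the streaming flow covered by the new "cohere.command-text-v14" parametrization.
# Assumes AWS credentials/region are available to litellm/boto3 via the environment
# (e.g. AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION_NAME) - these are assumptions.
messages = [{"content": "Hello, how are you?", "role": "user"}]

response = litellm.completion(
    model="bedrock/cohere.command-text-v14",  # explicit provider prefix for clarity
    messages=messages,
    max_tokens=10,
    stream=True,
)

complete_response = ""
for chunk in response:
    # Each chunk is an OpenAI-style streaming delta; content can be None on the final chunk.
    complete_response += chunk.choices[0].delta.content or ""

print(complete_response)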
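
As a quick sanity check on the cost lines in the new log, the arithmetic below reproduces the reported "final cost" from the logged token counts. The per-token rates are back-derived from the log's own dollar amounts (1.95e-05 / 13 and roughly 2.0e-05 / 10), not read out of litellm's pricing data.

# Reproduce the "final cost: 3.95e-05" line from the new test log.
# The per-token rates are inferred from the logged values and are assumptions,
# not authoritative entries from litellm's cost map.
prompt_tokens = 13
completion_tokens = 10
input_cost_per_token = 1.5e-06    # 1.95e-05 / 13
output_cost_per_token = 2.0e-06   # ~1.9999999999999998e-05 / 10

prompt_cost = prompt_tokens * input_cost_per_token             # ~1.95e-05
completion_cost = completion_tokens * output_cost_per_token    # ~2.0e-05
print(f"final cost: {prompt_cost + completion_cost}")          # ~3.95e-05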