diff --git a/litellm/__init__.py b/litellm/__init__.py
index db41a0fc5..b84ad602e 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -375,6 +375,7 @@ from .integrations import *
 from .exceptions import (
     AuthenticationError,
     InvalidRequestError,
+    BadRequestError,
     RateLimitError,
     ServiceUnavailableError,
     OpenAIError,
diff --git a/litellm/exceptions.py b/litellm/exceptions.py
index 9567fe799..03240fc6d 100644
--- a/litellm/exceptions.py
+++ b/litellm/exceptions.py
@@ -8,75 +8,82 @@
 # Thank you users! We ❤️ you! - Krrish & Ishaan
 
 ## LiteLLM versions of the OpenAI Exception Types
-from openai.error import (
+
+from openai import (
     AuthenticationError,
-    InvalidRequestError,
+    BadRequestError,
     RateLimitError,
-    ServiceUnavailableError,
+    APIStatusError,
     OpenAIError,
     APIError,
-    Timeout,
+    APITimeoutError,
     APIConnectionError,
 )
-
+import httpx
 
 class AuthenticationError(AuthenticationError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, response: httpx.Response):
         self.status_code = 401
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs
 
-
-class InvalidRequestError(InvalidRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
+class BadRequestError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider, response: httpx.Response):
         self.status_code = 400
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
         super().__init__(
-            self.message, f"{self.model}"
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs
 
-class Timeout(Timeout):  # type: ignore
-    def __init__(self, message, model, llm_provider):
+class Timeout(APITimeoutError):  # type: ignore
+    def __init__(self, message, model, llm_provider, request: httpx.Request):
         self.status_code = 408
         self.message = message
         self.model = model
         self.llm_provider = llm_provider
         super().__init__(
-            self.message, f"{self.model}"
+            request=request
         )  # Call the base class constructor with the parameters it needs
 
-# sub class of invalid request error - meant to give more granularity for error handling context window exceeded errors
-class ContextWindowExceededError(InvalidRequestError):  # type: ignore
-    def __init__(self, message, model, llm_provider):
-        self.status_code = 400
-        self.message = message
-        self.model = model
-        self.llm_provider = llm_provider
-        super().__init__(
-            self.message, self.model, self.llm_provider
-        )  # Call the base class constructor with the parameters it needs
-
-
 class RateLimitError(RateLimitError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, response: httpx.Response):
         self.status_code = 429
         self.message = message
         self.llm_provider = llm_provider
         self.modle = model
         super().__init__(
-            self.message
+            self.message,
+            response=response,
+            body=None
         )  # Call the base class constructor with the parameters it needs
 
+# sub class of bad request error - meant to give more granularity for error handling context window exceeded errors
+class ContextWindowExceededError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider, response: httpx.Response):
+        self.status_code = 400
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        super().__init__(
+            message=self.message,
+            model=self.model,
+            llm_provider=self.llm_provider,
+            response=response
+        )  # Call the base class constructor with the parameters it needs
 
-class ServiceUnavailableError(ServiceUnavailableError):  # type: ignore
+class ServiceUnavailableError(APIStatusError):  # type: ignore
     def __init__(self, message, llm_provider, model):
-        self.status_code = 500
+        self.status_code = 503
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
@@ -87,13 +94,14 @@ class ServiceUnavailableError(ServiceUnavailableError):  # type: ignore
 
 # raise this when the API returns an invalid response object - https://github.com/openai/openai-python/blob/1be14ee34a0f8e42d3f9aa5451aa4cb161f1781f/openai/api_requestor.py#L401
 class APIError(APIError):  # type: ignore
-    def __init__(self, status_code, message, llm_provider, model):
+    def __init__(self, status_code, message, llm_provider, model, request: httpx.Request):
         self.status_code = status_code
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            self.message,
+            request=request
         )
 
 # raised if an invalid request (not get, delete, put, post) is made
@@ -123,4 +131,15 @@ class BudgetExceededError(Exception):
         self.current_cost = current_cost
         self.max_budget = max_budget
         message = f"Budget has been exceeded! Current cost: {current_cost}, Max budget: {max_budget}"
-        super().__init__(message)
\ No newline at end of file
+        super().__init__(message)
+
+## DEPRECATED ##
+class InvalidRequestError(BadRequestError):  # type: ignore
+    def __init__(self, message, model, llm_provider):
+        self.status_code = 400
+        self.message = message
+        self.model = model
+        self.llm_provider = llm_provider
+        super().__init__(
+            self.message, f"{self.model}"
+        )  # Call the base class constructor with the parameters it needs
diff --git a/litellm/llms/base.py b/litellm/llms/base.py
index cc03be919..a7710a164 100644
--- a/litellm/llms/base.py
+++ b/litellm/llms/base.py
@@ -1,16 +1,21 @@
 ## This is a template base class to be used for adding new LLM providers via API calls
 import litellm
-import requests, certifi, ssl
+import httpx, certifi, ssl
 
 class BaseLLM:
+    _client_session = None
 
     def create_client_session(self):
         if litellm.client_session:
-            session = litellm.client_session
+            _client_session = litellm.client_session
         else:
-            session = requests.Session()
-
-        return session
+            _client_session = httpx.Client(timeout=600)
+        return _client_session
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        if hasattr(self, '_client_session'):
+            self._client_session.close()
+
     def validate_environment(self):  # set up the environment required to run the model
         pass
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index 3eaaea450..fdc6de8fa 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -1,14 +1,17 @@
 from typing import Optional, Union
-import types, requests
+import types
+import httpx
 from .base import BaseLLM
 from litellm.utils import ModelResponse, Choices, Message, CustomStreamWrapper, convert_to_model_response_object
 from typing import Callable, Optional
 import aiohttp
 
 class OpenAIError(Exception):
-    def __init__(self, status_code, message):
+    def __init__(self, status_code, message, request: httpx.Request, response: httpx.Response):
         self.status_code = status_code
         self.message = message
+        self.request = request
+        self.response = response
         super().__init__(
             self.message
         )  # Call the base class constructor with the parameters it needs
@@ -144,7 +147,7 @@ class OpenAITextCompletionConfig():
         and v is not None}
 
 class OpenAIChatCompletion(BaseLLM):
-    _client_session: requests.Session
+    _client_session: httpx.Client
 
     def __init__(self) -> None:
         super().__init__()
@@ -200,18 +203,8 @@ class OpenAIChatCompletion(BaseLLM):
                     return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
                 else:
                     return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response)
-            elif "stream" in optional_params and optional_params["stream"] == True:
-                response = self._client_session.post(
-                    url=api_base,
-                    json=data,
-                    headers=headers,
-                    stream=optional_params["stream"]
-                )
-                if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
-
-                ## RESPONSE OBJECT
-                return response.iter_lines()
+            elif optional_params.get("stream", False):
+                return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
             else:
                 response = self._client_session.post(
                     url=api_base,
@@ -219,7 +212,7 @@ class OpenAIChatCompletion(BaseLLM):
                     headers=headers,
                 )
                 if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
+                    raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
 
                 ## RESPONSE OBJECT
                 return convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)
@@ -246,41 +239,64 @@ class OpenAIChatCompletion(BaseLLM):
             exception_mapping_worked = True
             raise e
         except Exception as e:
-            if exception_mapping_worked:
-                raise e
-            else:
-                import traceback
-                raise OpenAIError(status_code=500, message=traceback.format_exc())
+            raise e
 
     async def acompletion(self, api_base: str, data: dict, headers: dict, model_response: ModelResponse):
-        async with aiohttp.ClientSession() as session:
-            async with session.post(api_base, json=data, headers=headers, ssl=False) as response:
-                response_json = await response.json()
-                if response.status != 200:
-                    raise OpenAIError(status_code=response.status, message=response.text)
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_base, json=data, headers=headers)
+            response_json = response.json()
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
 
-                ## RESPONSE OBJECT
-                return convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+            ## RESPONSE OBJECT
+            return convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+
+    def streaming(self,
+                  logging_obj,
+                  api_base: str,
+                  data: dict,
+                  headers: dict,
+                  model_response: ModelResponse,
+                  model: str
+    ):
+        with self._client_session.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.read().decode(), request=response.request, response=response)
+
+            completion_stream = response.iter_lines()
+            streamwrapper = CustomStreamWrapper(completion_stream=completion_stream, model=model, custom_llm_provider="openai", logging_obj=logging_obj)
+            for transformed_chunk in streamwrapper:
+                yield transformed_chunk
 
     async def async_streaming(self,
                           logging_obj,
                           api_base: str,
-                          data: dict, headers: dict,
+                          data: dict,
+                          headers: dict,
                           model_response: ModelResponse,
                           model: str):
-        async with aiohttp.ClientSession() as session:
-            async with session.post(api_base, json=data, headers=headers, ssl=False) as response:
-                # Check if the request was successful (status code 200)
-                if response.status != 200:
-                    raise OpenAIError(status_code=response.status, message=await response.text())
-
-                streamwrapper = CustomStreamWrapper(completion_stream=response, model=model, custom_llm_provider="openai",logging_obj=logging_obj)
-                async for transformed_chunk in streamwrapper:
-                    yield transformed_chunk
+        client = httpx.AsyncClient()
+        async with client.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=(await response.aread()).decode(), request=response.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="openai", logging_obj=logging_obj)
+            async for transformed_chunk in streamwrapper:
+                yield transformed_chunk
 
     def embedding(self,
                 model: str,
@@ -349,7 +365,7 @@ class OpenAIChatCompletion(BaseLLM):
 
 class OpenAITextCompletion(BaseLLM):
-    _client_session: requests.Session
+    _client_session: httpx.Client
 
     def __init__(self) -> None:
         super().__init__()
@@ -367,7 +383,7 @@ class OpenAITextCompletion(BaseLLM):
         try:
             ## RESPONSE OBJECT
             if response_object is None or model_response_object is None:
-                raise OpenAIError(status_code=500, message="Error in response object format")
+                raise ValueError("Error in response object format")
             choice_list=[]
             for idx, choice in enumerate(response_object["choices"]):
                 message = Message(content=choice["text"], role="assistant")
@@ -386,8 +402,8 @@ class OpenAITextCompletion(BaseLLM):
             model_response_object._hidden_params["original_response"] = response_object # track original response, if users make a litellm.text_completion() request, we can return the original response
             return model_response_object
-        except:
-            OpenAIError(status_code=500, message="Invalid response object.")
+        except Exception as e:
+            raise e
 
     def completion(self, 
                model: Optional[str]=None,
@@ -397,6 +413,7 @@ class OpenAITextCompletion(BaseLLM):
                api_key: Optional[str]=None,
                api_base: Optional[str]=None,
                logging_obj=None,
+               acompletion: bool = False,
                optional_params=None,
                litellm_params=None,
                logger_fn=None,
@@ -412,9 +429,6 @@ class OpenAITextCompletion(BaseLLM):
                 api_base = f"{api_base}/completions"
 
             if len(messages)>0 and "content" in messages[0] and type(messages[0]["content"]) == list:
-                # Note: internal logic - for enabling litellm.text_completion()
-                # text-davinci-003 can accept a string or array, if it's an array, assume the array is set in messages[0]['content']
-                # https://platform.openai.com/docs/api-reference/completions/create
                 prompt = messages[0]["content"]
             else:
                 prompt = " ".join([message["content"] for message in messages]) # type: ignore
@@ -431,19 +445,13 @@ class OpenAITextCompletion(BaseLLM):
                     api_key=api_key,
                     additional_args={"headers": headers, "api_base": api_base, "data": data},
                 )
-
-            if "stream" in optional_params and optional_params["stream"] == True:
-                response = self._client_session.post(
-                    url=f"{api_base}",
-                    json=data,
-                    headers=headers,
-                    stream=optional_params["stream"]
-                )
-                if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
-
-                ## RESPONSE OBJECT
-                return response.iter_lines()
+            if acompletion == True:
+                if optional_params.get("stream", False):
+                    return self.async_streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
+                else:
+                    return self.acompletion(api_base=api_base, data=data, headers=headers, model_response=model_response, prompt=prompt, api_key=api_key, logging_obj=logging_obj, model=model)
+            elif optional_params.get("stream", False):
+                return self.streaming(logging_obj=logging_obj, api_base=api_base, data=data, headers=headers, model_response=model_response, model=model)
             else:
                 response = self._client_session.post(
                     url=f"{api_base}",
@@ -451,7 +459,7 @@
                     headers=headers,
                 )
                 if response.status_code != 200:
-                    raise OpenAIError(status_code=response.status_code, message=response.text)
+                    raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
 
                 ## LOGGING
                 logging_obj.post_call(
@@ -466,12 +474,76 @@
                 ## RESPONSE OBJECT
                 return self.convert_to_model_response_object(response_object=response.json(), model_response_object=model_response)
-        except OpenAIError as e:
-            exception_mapping_worked = True
-            raise e
         except Exception as e:
-            if exception_mapping_worked:
-                raise e
-            else:
-                import traceback
-                raise OpenAIError(status_code=500, message=traceback.format_exc())
+            raise e
+
+    async def acompletion(self,
+                          logging_obj,
+                          api_base: str,
+                          data: dict,
+                          headers: dict,
+                          model_response: ModelResponse,
+                          prompt: str,
+                          api_key: str,
+                          model: str):
+        async with httpx.AsyncClient() as client:
+            response = await client.post(api_base, json=data, headers=headers)
+            response_json = response.json()
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.text, request=response.request, response=response)
+
+            ## LOGGING
+            logging_obj.post_call(
+                input=prompt,
+                api_key=api_key,
+                original_response=response,
+                additional_args={
+                    "headers": headers,
+                    "api_base": api_base,
+                },
+            )
+
+            ## RESPONSE OBJECT
+            return self.convert_to_model_response_object(response_object=response_json, model_response_object=model_response)
+
+    def streaming(self,
+                  logging_obj,
+                  api_base: str,
+                  data: dict,
+                  headers: dict,
+                  model_response: ModelResponse,
+                  model: str
+    ):
+        with self._client_session.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=response.read().decode(), request=response.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="text-completion-openai", logging_obj=logging_obj)
+            for transformed_chunk in streamwrapper:
+                yield transformed_chunk
+
+    async def async_streaming(self,
+                          logging_obj,
+                          api_base: str,
+                          data: dict,
+                          headers: dict,
+                          model_response: ModelResponse,
+                          model: str):
+        client = httpx.AsyncClient()
+        async with client.stream(
+                    url=f"{api_base}",
+                    json=data,
+                    headers=headers,
+                    method="POST"
+                ) as response:
+            if response.status_code != 200:
+                raise OpenAIError(status_code=response.status_code, message=(await response.aread()).decode(), request=response.request, response=response)
+
+            streamwrapper = CustomStreamWrapper(completion_stream=response.aiter_lines(), model=model, custom_llm_provider="text-completion-openai", logging_obj=logging_obj)
+            async for transformed_chunk in streamwrapper:
+                yield transformed_chunk
\ No newline at end of file
diff --git a/litellm/main.py b/litellm/main.py
index
c7c421783..155012a35 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -138,8 +138,10 @@ async def acompletion(*args, **kwargs): _, custom_llm_provider, _, _ = get_llm_provider(model=model, api_base=kwargs.get("api_base", None)) - - if (custom_llm_provider == "openai" or custom_llm_provider == "azure" or custom_llm_provider == "custom_openai"): # currently implemented aiohttp calls for just azure and openai, soon all. + if (custom_llm_provider == "openai" + or custom_llm_provider == "azure" + or custom_llm_provider == "custom_openai" + or custom_llm_provider == "text-completion-openai"): # currently implemented aiohttp calls for just azure and openai, soon all. if kwargs.get("stream", False): response = completion(*args, **kwargs) else: @@ -161,7 +163,7 @@ async def acompletion(*args, **kwargs): line async for line in response ) - else: + else: end_time = datetime.datetime.now() # [OPTIONAL] ADD TO CACHE if litellm.caching or litellm.caching_with_models or litellm.cache != None: # user init a cache object @@ -596,15 +598,16 @@ def completion( print_verbose=print_verbose, api_key=api_key, api_base=api_base, + acompletion=acompletion, logging_obj=logging, optional_params=optional_params, litellm_params=litellm_params, logger_fn=logger_fn ) - if "stream" in optional_params and optional_params["stream"] == True: - response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging) - return response + # if "stream" in optional_params and optional_params["stream"] == True: + # response = CustomStreamWrapper(model_response, model, custom_llm_provider="text-completion-openai", logging_obj=logging) + # return response response = model_response elif ( "replicate" in model or diff --git a/litellm/tests/test_async_fn.py b/litellm/tests/test_async_fn.py index 5d9d44b20..ba1a375c8 100644 --- a/litellm/tests/test_async_fn.py +++ b/litellm/tests/test_async_fn.py @@ -28,15 +28,13 @@ def test_async_response(): user_message = "Hello, how are you?" messages = [{"content": user_message, "role": "user"}] try: - response = await acompletion(model="gpt-3.5-turbo", messages=messages) - print(f"response: {response}") - response = await acompletion(model="azure/chatgpt-v-2", messages=messages) + response = await acompletion(model="gpt-3.5-turbo-instruct", messages=messages) print(f"response: {response}") except Exception as e: pytest.fail(f"An exception occurred: {e}") response = asyncio.run(test_get_response()) -# print(response) + print(response) # test_async_response() def test_get_response_streaming(): @@ -45,8 +43,7 @@ def test_get_response_streaming(): user_message = "write a short poem in one sentence" messages = [{"content": user_message, "role": "user"}] try: - response = await acompletion(model="azure/chatgpt-v-2", messages=messages, stream=True) - # response = await acompletion(model="gpt-3.5-turbo", messages=messages, stream=True) + response = await acompletion(model="gpt-3.5-turbo-instruct", messages=messages, stream=True) print(type(response)) import inspect @@ -59,18 +56,17 @@ def test_get_response_streaming(): async for chunk in response: token = chunk["choices"][0]["delta"].get("content", "") output += token - print(f"output: {output}") assert output is not None, "output cannot be None." assert isinstance(output, str), "output needs to be of type str" - assert len(output) > 0, f"Length of output needs to be greater than 0. {output}" - + assert len(output) > 0, "Length of output needs to be greater than 0." 
+ print(f'output: {output}') except Exception as e: pytest.fail(f"An exception occurred: {e}") return response asyncio.run(test_async_call()) -test_get_response_streaming() +# test_get_response_streaming() def test_get_response_non_openai_streaming(): import asyncio diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 2e1c35261..ad48a09cc 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -9,7 +9,7 @@ sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import pytest -from openai.error import Timeout +from openai import Timeout import litellm from litellm import embedding, completion, completion_cost from litellm import RateLimitError @@ -405,7 +405,6 @@ def test_completion_openai(): litellm.api_key = os.environ['OPENAI_API_KEY'] response = completion(model="gpt-3.5-turbo", messages=messages, max_tokens=10, request_timeout=10) print("This is the response object\n", response) - print("\n\nThis is response ms:", response.response_ms) response_str = response["choices"][0]["message"]["content"] @@ -422,14 +421,15 @@ def test_completion_openai(): pass except Exception as e: pytest.fail(f"Error occurred: {e}") -# test_completion_openai() +test_completion_openai() def test_completion_text_openai(): try: - litellm.set_verbose = True + # litellm.set_verbose = True response = completion(model="gpt-3.5-turbo-instruct", messages=messages) - print(response) + print(response["choices"][0]["message"]["content"]) except Exception as e: + print(e) pytest.fail(f"Error occurred: {e}") # test_completion_text_openai() diff --git a/litellm/tests/test_completion_with_retries.py b/litellm/tests/test_completion_with_retries.py index 30ec9ec8c..d5d837e95 100644 --- a/litellm/tests/test_completion_with_retries.py +++ b/litellm/tests/test_completion_with_retries.py @@ -14,7 +14,7 @@ import litellm from litellm import completion_with_retries, completion from litellm import ( AuthenticationError, - InvalidRequestError, + BadRequestError, RateLimitError, ServiceUnavailableError, OpenAIError, diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py index bc375c4bf..5a5c22898 100644 --- a/litellm/tests/test_exceptions.py +++ b/litellm/tests/test_exceptions.py @@ -1,4 +1,8 @@ -from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError +try: + from openai import AuthenticationError, BadRequestError, RateLimitError, OpenAIError +except: + from openai.error import AuthenticationError, InvalidRequestError, RateLimitError, OpenAIError + import os import sys import traceback @@ -38,23 +42,24 @@ models = ["command-nightly"] # Test 1: Context Window Errors @pytest.mark.parametrize("model", models) def test_context_window(model): - sample_text = "Say error 50 times" * 100000 + sample_text = "Say error 50 times" * 10000 messages = [{"content": sample_text, "role": "user"}] - print(f"model: {model}") try: - completion(model=model, messages=messages) + response = completion(model=model, messages=messages) + print(f"response: {response}") + print("FAILED!") pytest.fail(f"An exception occurred") - except ContextWindowExceededError: - pass + except ContextWindowExceededError as e: + print(f"Worked!") except RateLimitError: - pass + print("RateLimited!") except Exception as e: print(f"{e}") pytest.fail(f"An error occcurred - {e}") @pytest.mark.parametrize("model", models) def test_context_window_with_fallbacks(model): - ctx_window_fallback_dict = {"command-nightly": 
"claude-2"} + ctx_window_fallback_dict = {"command-nightly": "claude-2", "gpt-3.5-turbo-instruct": "gpt-3.5-turbo-16k"} sample_text = "how does a court case get to the Supreme Court?" * 1000 messages = [{"content": sample_text, "role": "user"}] @@ -62,8 +67,8 @@ def test_context_window_with_fallbacks(model): # for model in litellm.models_by_provider["bedrock"]: # test_context_window(model=model) -# test_context_window(model="gpt-3.5-turbo-instruct") -# test_context_window_with_fallbacks(model="command-nightly") +# test_context_window(model="gpt-3.5-turbo") +# test_context_window_with_fallbacks(model="gpt-3.5-turbo") # Test 2: InvalidAuth Errors @pytest.mark.parametrize("model", models) def invalid_auth(model): # set the model key to an invalid key, depending on the model @@ -158,14 +163,14 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th # for model in litellm.models_by_provider["bedrock"]: # invalid_auth(model=model) -# invalid_auth(model="gpt-3.5-turbo-instruct") +# invalid_auth(model="gpt-3.5-turbo") # Test 3: Invalid Request Error @pytest.mark.parametrize("model", models) def test_invalid_request_error(model): messages = [{"content": "hey, how's it going?", "role": "user"}] - with pytest.raises(InvalidRequestError): + with pytest.raises(BadRequestError): completion(model=model, messages=messages, max_tokens="hello world") # test_invalid_request_error(model="gpt-3.5-turbo") @@ -178,15 +183,16 @@ def test_invalid_request_error(model): # response = completion(model=model, messages=messages) # except RateLimitError: # return True -# except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server -# return True +# # except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server +# # return True # except Exception as e: # print(f"Uncaught Exception {model}: {type(e).__name__} - {e}") # traceback.print_exc() # pass # return False # # Repeat each model 500 times -# extended_models = [model for model in models for _ in range(250)] +# # extended_models = [model for model in models for _ in range(250)] +# extended_models = ["gpt-3.5-turbo-instruct" for _ in range(250)] # def worker(model): # return test_model_call(model) diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index c9b3b9a67..316309e04 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -11,7 +11,7 @@ sys.path.insert( from dotenv import load_dotenv load_dotenv() import litellm -from litellm import completion, acompletion, AuthenticationError, InvalidRequestError, RateLimitError +from litellm import completion, acompletion, AuthenticationError, BadRequestError, RateLimitError, ModelResponse litellm.logging = False litellm.set_verbose = False @@ -47,38 +47,17 @@ first_openai_chunk_example = { def validate_first_format(chunk): # write a test to make sure chunk follows the same format as first_openai_chunk_example - assert isinstance(chunk, dict), "Chunk should be a dictionary." - assert "id" in chunk, "Chunk should have an 'id'." + assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." assert isinstance(chunk['id'], str), "'id' should be a string." - - assert "object" in chunk, "Chunk should have an 'object'." assert isinstance(chunk['object'], str), "'object' should be a string." - - assert "created" in chunk, "Chunk should have a 'created'." assert isinstance(chunk['created'], int), "'created' should be an integer." 
- - assert "model" in chunk, "Chunk should have a 'model'." assert isinstance(chunk['model'], str), "'model' should be a string." - - assert "choices" in chunk, "Chunk should have 'choices'." assert isinstance(chunk['choices'], list), "'choices' should be a list." for choice in chunk['choices']: - assert isinstance(choice, dict), "Each choice should be a dictionary." - - assert "index" in choice, "Each choice should have 'index'." assert isinstance(choice['index'], int), "'index' should be an integer." - - assert "delta" in choice, "Each choice should have 'delta'." - assert isinstance(choice['delta'], dict), "'delta' should be a dictionary." - - assert "role" in choice['delta'], "'delta' should have a 'role'." assert isinstance(choice['delta']['role'], str), "'role' should be a string." - - assert "content" in choice['delta'], "'delta' should have 'content'." assert isinstance(choice['delta']['content'], str), "'content' should be a string." - - assert "finish_reason" in choice, "Each choice should have 'finish_reason'." assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string." second_openai_chunk_example = { @@ -98,35 +77,16 @@ second_openai_chunk_example = { } def validate_second_format(chunk): - assert isinstance(chunk, dict), "Chunk should be a dictionary." - assert "id" in chunk, "Chunk should have an 'id'." + assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." assert isinstance(chunk['id'], str), "'id' should be a string." - - assert "object" in chunk, "Chunk should have an 'object'." assert isinstance(chunk['object'], str), "'object' should be a string." - - assert "created" in chunk, "Chunk should have a 'created'." assert isinstance(chunk['created'], int), "'created' should be an integer." - - assert "model" in chunk, "Chunk should have a 'model'." assert isinstance(chunk['model'], str), "'model' should be a string." - - assert "choices" in chunk, "Chunk should have 'choices'." assert isinstance(chunk['choices'], list), "'choices' should be a list." for choice in chunk['choices']: - assert isinstance(choice, dict), "Each choice should be a dictionary." - - assert "index" in choice, "Each choice should have 'index'." assert isinstance(choice['index'], int), "'index' should be an integer." - - assert "delta" in choice, "Each choice should have 'delta'." - assert isinstance(choice['delta'], dict), "'delta' should be a dictionary." - - assert "content" in choice['delta'], "'delta' should have 'content'." assert isinstance(choice['delta']['content'], str), "'content' should be a string." - - assert "finish_reason" in choice, "Each choice should have 'finish_reason'." assert (choice['finish_reason'] is None) or isinstance(choice['finish_reason'], str), "'finish_reason' should be None or a string." last_openai_chunk_example = { @@ -144,32 +104,15 @@ last_openai_chunk_example = { } def validate_last_format(chunk): - assert isinstance(chunk, dict), "Chunk should be a dictionary." - assert "id" in chunk, "Chunk should have an 'id'." + assert isinstance(chunk, ModelResponse), "Chunk should be a dictionary." assert isinstance(chunk['id'], str), "'id' should be a string." - - assert "object" in chunk, "Chunk should have an 'object'." assert isinstance(chunk['object'], str), "'object' should be a string." - - assert "created" in chunk, "Chunk should have a 'created'." assert isinstance(chunk['created'], int), "'created' should be an integer." 
- - assert "model" in chunk, "Chunk should have a 'model'." assert isinstance(chunk['model'], str), "'model' should be a string." - - assert "choices" in chunk, "Chunk should have 'choices'." assert isinstance(chunk['choices'], list), "'choices' should be a list." for choice in chunk['choices']: - assert isinstance(choice, dict), "Each choice should be a dictionary." - - assert "index" in choice, "Each choice should have 'index'." assert isinstance(choice['index'], int), "'index' should be an integer." - - assert "delta" in choice, "Each choice should have 'delta'." - assert isinstance(choice['delta'], dict), "'delta' should be a dictionary." - - assert "finish_reason" in choice, "Each choice should have 'finish_reason'." assert isinstance(choice['finish_reason'], str), "'finish_reason' should be a string." def streaming_format_tests(idx, chunk): @@ -188,6 +131,7 @@ def streaming_format_tests(idx, chunk): if chunk["choices"][0]["finish_reason"]: # ensure finish reason is only in last chunk validate_last_format(chunk=chunk) finished = True + print(f"chunk choices: {chunk['choices'][0]['delta']['content']}") if "content" in chunk["choices"][0]["delta"]: extracted_chunk = chunk["choices"][0]["delta"]["content"] print(f"extracted chunk: {extracted_chunk}") @@ -549,7 +493,7 @@ def test_completion_claude_stream_bad_key(): pytest.fail(f"Error occurred: {e}") -test_completion_claude_stream_bad_key() +# test_completion_claude_stream_bad_key() # test_completion_replicate_stream() # def test_completion_vertexai_stream(): @@ -824,7 +768,7 @@ def ai21_completion_call_bad_key(): if complete_response.strip() == "": raise Exception("Empty response received") print(f"completion_response: {complete_response}") - except InvalidRequestError as e: + except Bad as e: pass except: pytest.fail(f"error occurred: {traceback.format_exc()}") @@ -885,7 +829,7 @@ def ai21_completion_call_bad_key(): # test on openai completion call def test_openai_chat_completion_call(): try: - litellm.set_verbose = True + litellm.set_verbose = False response = completion( model="gpt-3.5-turbo", messages=messages, stream=True ) @@ -904,7 +848,7 @@ def test_openai_chat_completion_call(): print(f"error occurred: {traceback.format_exc()}") pass -# test_openai_chat_completion_call() +test_openai_chat_completion_call() def test_openai_chat_completion_complete_response_call(): try: @@ -928,6 +872,7 @@ def test_openai_text_completion_call(): start_time = time.time() for idx, chunk in enumerate(response): chunk, finished = streaming_format_tests(idx, chunk) + print(f"chunk: {chunk}") complete_response += chunk if finished: break @@ -939,6 +884,8 @@ def test_openai_text_completion_call(): print(f"error occurred: {traceback.format_exc()}") pass +# test_openai_text_completion_call() + # # test on together ai completion call - starcoder def test_together_ai_completion_call_starcoder(): try: @@ -992,7 +939,7 @@ def test_together_ai_completion_call_starcoder_bad_key(): if complete_response == "": raise Exception("Empty response received") print(f"complete response: {complete_response}") - except InvalidRequestError as e: + except BadRequestError as e: pass except: print(f"error occurred: {traceback.format_exc()}") diff --git a/litellm/timeout.py b/litellm/timeout.py index a55f31dac..65af51b98 100644 --- a/litellm/timeout.py +++ b/litellm/timeout.py @@ -17,7 +17,10 @@ from concurrent import futures from inspect import iscoroutinefunction from functools import wraps from threading import Thread -from openai.error import Timeout +try: + from openai 
import Timeout +except: + from openai.error import Timeout def timeout(timeout_duration: float = 0.0, exception_to_raise=Timeout): diff --git a/litellm/utils.py b/litellm/utils.py index 281ad713e..c7a7fb292 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -39,11 +39,15 @@ from .integrations.weights_biases import WeightsBiasesLogger from .integrations.custom_logger import CustomLogger from .integrations.langfuse import LangFuseLogger from .integrations.litedebugger import LiteDebugger -from openai.error import OpenAIError as OriginalError -from openai.openai_object import OpenAIObject +try: + from openai import OpenAIError as OriginalError + from openai._models import BaseModel as OpenAIObject +except: + from openai.error import OpenAIError as OriginalError + from openai.openai_object import OpenAIObject from .exceptions import ( AuthenticationError, - InvalidRequestError, + BadRequestError, RateLimitError, ServiceUnavailableError, OpenAIError, @@ -53,7 +57,7 @@ from .exceptions import ( APIError, BudgetExceededError ) -from typing import cast, List, Dict, Union, Optional +from typing import cast, List, Dict, Union, Optional, Literal from .caching import Cache ####### ENVIRONMENT VARIABLES #################### @@ -127,13 +131,41 @@ class Message(OpenAIObject): if function_call: self.function_call = function_call + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + class Delta(OpenAIObject): - def __init__(self, content=None, logprobs=None, role=None, **params): + def __init__(self, content=None, role=None, **params): super(Delta, self).__init__(**params) if content is not None: self.content = content if role: self.role = role + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class Choices(OpenAIObject): @@ -148,6 +180,22 @@ class Choices(OpenAIObject): self.message = Message(content=None) else: self.message = message + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class Usage(OpenAIObject): def __init__(self, prompt_tokens=None, completion_tokens=None, total_tokens=None, **params): @@ -158,6 +206,22 @@ class Usage(OpenAIObject): self.completion_tokens = completion_tokens if total_tokens: self.total_tokens = total_tokens + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def 
get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class StreamingChoices(OpenAIObject): def __init__(self, finish_reason=None, index=0, delta: Optional[Delta]=None, **params): @@ -168,47 +232,97 @@ class StreamingChoices(OpenAIObject): self.delta = delta else: self.delta = Delta() + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) -class ModelResponse(OpenAIObject): - def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params): + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + +class ModelResponse(OpenAIObject): + id: str + """A unique identifier for the completion.""" + + choices: List[Union[Choices, StreamingChoices]] + """The list of completion choices the model generated for the input prompt.""" + + created: int + """The Unix timestamp (in seconds) of when the completion was created.""" + + model: Optional[str] = None + """The model used for completion.""" + + object: str + """The object type, which is always "text_completion" """ + + system_fingerprint: Optional[str] = None + """This fingerprint represents the backend configuration that the model runs with. + + Can be used in conjunction with the `seed` request parameter to understand when + backend changes have been made that might impact determinism. 
+ """ + + usage: Optional[Usage] = None + """Usage statistics for the completion request.""" + + _hidden_params: dict = {} + + def __init__(self, id=None, choices=None, created=None, model=None, object=None, system_fingerprint=None, usage=None, stream=False, response_ms=None, hidden_params=None, **params): if stream: - self.object = "chat.completion.chunk" - self.choices = [StreamingChoices()] + object = "chat.completion.chunk" + choices = [StreamingChoices()] else: if model in litellm.open_ai_embedding_models: - self.object = "embedding" + object = "embedding" else: - self.object = "chat.completion" - self.choices = [Choices()] + object = "chat.completion" + choices = [Choices()] if id is None: - self.id = _generate_id() + id = _generate_id() else: - self.id = id + id = id if created is None: - self.created = int(time.time()) + created = int(time.time()) else: - self.created = created + created = created if response_ms: - self._response_ms = response_ms + _response_ms = response_ms else: - self._response_ms = None - self.model = model + _response_ms = None + model = model if usage: - self.usage = usage + usage = usage else: - self.usage = Usage() - self._hidden_params = {} # used in case users want to access the original model response - super(ModelResponse, self).__init__(**params) - - def to_dict_recursive(self): - d = super().to_dict_recursive() - d["choices"] = [choice.to_dict_recursive() for choice in self.choices] - return d - - def cost(self): - # for non streaming responses - return completion_cost(completion_response=self) + usage = Usage() + if hidden_params: + self._hidden_params = hidden_params + super().__init__(id=id, choices=choices, created=created, model=model, object=object, system_fingerprint=system_fingerprint, usage=usage, **params) + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) class EmbeddingResponse(OpenAIObject): def __init__(self, id=None, choices=None, created=None, model=None, usage=None, stream=False, response_ms=None, **params): self.object = "list" @@ -2771,7 +2885,7 @@ def valid_model(model): messages = [{"role": "user", "content": "Hello World"}] litellm.completion(model=model, messages=messages) except: - raise InvalidRequestError(message="", model=model, llm_provider="") + raise BadRequestError(message="", model=model, llm_provider="") def check_valid_key(model: str, api_key: str): """ @@ -2955,7 +3069,7 @@ def exception_type( exception_mapping_worked = True if custom_llm_provider == "openrouter": if original_exception.http_status == 413: - raise InvalidRequestError( + raise BadRequestError( message=str(original_exception), model=model, llm_provider="openrouter" @@ -2975,19 +3089,21 @@ def exception_type( else: exception_type = "" if custom_llm_provider == "openai" or custom_llm_provider == "text-completion-openai": - if "This model's maximum context length is" in error_str: + if "This model's maximum context length is" in error_str or "Request too large" in error_str: exception_mapping_worked = True raise ContextWindowExceededError( message=f"OpenAIException - {original_exception.message}", 
llm_provider="openai", - model=model + model=model, + response=original_exception.response ) - elif "invalid_request_error" in error_str: + elif "invalid_request_error" in error_str and "Incorrect API key provided" not in error_str: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"OpenAIException - {original_exception.message}", llm_provider="openai", - model=model + model=model, + response=original_exception.response ) elif hasattr(original_exception, "status_code"): exception_mapping_worked = True @@ -2996,21 +3112,24 @@ def exception_type( raise AuthenticationError( message=f"OpenAIException - {original_exception.message}", llm_provider="openai", - model=model + model=model, + response=original_exception.response ) elif original_exception.status_code == 408: exception_mapping_worked = True raise Timeout( message=f"OpenAIException - {original_exception.message}", model=model, - llm_provider="openai" + llm_provider="openai", + request=original_exception.request ) if original_exception.status_code == 422: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"OpenAIException - {original_exception.message}", model=model, llm_provider="openai", + response=original_exception.response ) elif original_exception.status_code == 429: exception_mapping_worked = True @@ -3018,6 +3137,7 @@ def exception_type( message=f"OpenAIException - {original_exception.message}", model=model, llm_provider="openai", + response=original_exception.response ) else: exception_mapping_worked = True @@ -3025,7 +3145,8 @@ def exception_type( status_code=original_exception.status_code, message=f"OpenAIException - {original_exception.message}", llm_provider="openai", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "anthropic": # one of the anthropics if hasattr(original_exception, "message"): @@ -3034,14 +3155,16 @@ def exception_type( raise ContextWindowExceededError( message=original_exception.message, model=model, - llm_provider="anthropic" + llm_provider="anthropic", + response=original_exception.response ) if "Invalid API Key" in original_exception.message: exception_mapping_worked = True raise AuthenticationError( message=original_exception.message, model=model, - llm_provider="anthropic" + llm_provider="anthropic", + response=original_exception.response ) if hasattr(original_exception, "status_code"): print_verbose(f"status_code: {original_exception.status_code}") @@ -3050,35 +3173,32 @@ def exception_type( raise AuthenticationError( message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic", - model=model + model=model, + response=original_exception.response ) - elif original_exception.status_code == 400: + elif original_exception.status_code == 400 or original_exception.status_code == 413: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"AnthropicException - {original_exception.message}", model=model, llm_provider="anthropic", + response=original_exception.response ) elif original_exception.status_code == 408: exception_mapping_worked = True raise Timeout( - message=f"AnthropicException - {original_exception.message}", - model=model, - llm_provider="anthropic" - ) - elif original_exception.status_code == 413: - exception_mapping_worked = True - raise InvalidRequestError( message=f"AnthropicException - {original_exception.message}", model=model, llm_provider="anthropic", + request=original_exception.request 
) elif original_exception.status_code == 429: exception_mapping_worked = True raise RateLimitError( message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic", - model=model + model=model, + response=original_exception.response ) elif original_exception.status_code == 500: exception_mapping_worked = True @@ -3093,7 +3213,8 @@ def exception_type( status_code=original_exception.status_code, message=f"AnthropicException - {original_exception.message}", llm_provider="anthropic", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "replicate": if "Incorrect authentication token" in error_str: @@ -3101,7 +3222,8 @@ def exception_type( raise AuthenticationError( message=f"ReplicateException - {error_str}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) elif "input is too long" in error_str: exception_mapping_worked = True @@ -3109,20 +3231,23 @@ def exception_type( message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate", + response=original_exception.response ) elif exception_type == "ModelError": exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"ReplicateException - {error_str}", model=model, llm_provider="replicate", + response=original_exception.response ) elif "Request was throttled" in error_str: exception_mapping_worked = True raise RateLimitError( message=f"ReplicateException - {error_str}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) elif hasattr(original_exception, "status_code"): if original_exception.status_code == 401: @@ -3130,35 +3255,32 @@ def exception_type( raise AuthenticationError( message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) - elif original_exception.status_code == 400 or original_exception.status_code == 422: + elif original_exception.status_code == 400 or original_exception.status_code == 422 or original_exception.status_code == 413: exception_mapping_worked = True - raise InvalidRequestError( + raise BadRequestError( message=f"ReplicateException - {original_exception.message}", model=model, llm_provider="replicate", + response=original_exception.response ) elif original_exception.status_code == 408: exception_mapping_worked = True raise Timeout( - message=f"ReplicateException - {original_exception.message}", - model=model, - llm_provider="replicate" - ) - elif original_exception.status_code == 413: - exception_mapping_worked = True - raise InvalidRequestError( message=f"ReplicateException - {original_exception.message}", model=model, llm_provider="replicate", + request=original_exception.request ) elif original_exception.status_code == 429: exception_mapping_worked = True raise RateLimitError( message=f"ReplicateException - {original_exception.message}", llm_provider="replicate", - model=model + model=model, + response=original_exception.response ) elif original_exception.status_code == 500: exception_mapping_worked = True @@ -3172,7 +3294,8 @@ def exception_type( status_code=500, message=f"ReplicateException - {str(original_exception)}", llm_provider="replicate", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "bedrock": if "too many tokens" in error_str or "expected maxLength:" in error_str or "Input is too long" in error_str or "Too many input tokens" in error_str: @@ 
-3180,28 +3303,32 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"BedrockException: Context Window Error - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if "Malformed input request" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"BedrockException - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if "Unable to locate credentials" in error_str or "The security token included in the request is invalid" in error_str:
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"BedrockException Invalid Authentication - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if "throttlingException" in error_str or "ThrottlingException" in error_str:
                exception_mapping_worked = True
                raise RateLimitError(
                    message=f"BedrockException: Rate Limit Error - {error_str}",
                    model=model,
-                   llm_provider="bedrock"
+                   llm_provider="bedrock",
+                   response=original_exception.response
                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 500:
@@ -3216,33 +3343,37 @@ def exception_type(
                    raise AuthenticationError(
                        message=f"BedrockException - {original_exception.message}",
                        llm_provider="bedrock",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
        elif custom_llm_provider == "sagemaker":
            if "Unable to locate credentials" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"SagemakerException - {error_str}",
                    model=model,
-                   llm_provider="sagemaker"
+                   llm_provider="sagemaker",
+                   response=original_exception.response
                )
        elif custom_llm_provider == "vertex_ai":
            if "Vertex AI API has not been used in project" in error_str or "Unable to find your project" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"VertexAIException - {error_str}",
                    model=model,
-                   llm_provider="vertex_ai"
+                   llm_provider="vertex_ai",
+                   response=original_exception.response
                )
        elif custom_llm_provider == "palm":
            if "503 Getting metadata" in error_str: # auth errors look like this
                # 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate.
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"PalmException - Invalid api key",
                    model=model,
-                   llm_provider="palm"
+                   llm_provider="palm",
+                   response=original_exception.response
                )
            if "400 Request payload size exceeds" in error_str:
                exception_mapping_worked = True
@@ -3250,6 +3381,7 @@ def exception_type(
                    message=f"PalmException - {error_str}",
                    model=model,
                    llm_provider="palm",
+                   response=original_exception.response
                ) # Dailed: Error occurred: 400 Request payload size exceeds the limit: 20000 bytes
        elif custom_llm_provider == "cohere": # Cohere
@@ -3261,7 +3393,8 @@ def exception_type(
                raise AuthenticationError(
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "too many tokens" in error_str:
                exception_mapping_worked = True
@@ -3269,14 +3402,16 @@ def exception_type(
                    message=f"CohereException - {original_exception.message}",
                    model=model,
                    llm_provider="cohere",
+                   response=original_exception.response
                )
            elif hasattr(original_exception, "status_code"):
                if original_exception.status_code == 400 or original_exception.status_code == 498:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"CohereException - {original_exception.message}",
                        llm_provider="cohere",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 500:
                    exception_mapping_worked = True
@@ -3292,14 +3427,16 @@ def exception_type(
                    raise RateLimitError(
                        message=f"CohereException - {original_exception.message}",
                        llm_provider="cohere",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
            elif "invalid type:" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "Unexpected server error" in error_str:
                exception_mapping_worked = True
@@ -3315,7 +3452,8 @@ def exception_type(
                    status_code=original_exception.status_code,
                    message=f"CohereException - {original_exception.message}",
                    llm_provider="cohere",
-                   model=model
+                   model=model,
+                   request=original_exception.request
                )
            raise original_exception
        elif custom_llm_provider == "huggingface":
@@ -3324,14 +3462,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=error_str,
                    model=model,
-                   llm_provider="huggingface"
+                   llm_provider="huggingface",
+                   response=original_exception.response
                )
            elif "A valid user token is required" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=error_str,
                    llm_provider="huggingface",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
@@ -3339,28 +3479,32 @@ def exception_type(
                    raise AuthenticationError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 400:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        model=model,
                        llm_provider="huggingface",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 408:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"HuggingfaceException - {original_exception.message}",
                        model=model,
-                       llm_provider="huggingface"
+                       llm_provider="huggingface",
+                       request=original_exception.request
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3368,7 +3512,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"HuggingfaceException - {original_exception.message}",
                        llm_provider="huggingface",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
            exception_mapping_worked = True
            raise APIError(status_code=500, message=error_str, model=model, llm_provider=custom_llm_provider)
@@ -3379,14 +3524,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"AI21Exception - {original_exception.message}",
                    model=model,
-                   llm_provider="ai21"
+                   llm_provider="ai21",
+                   response=original_exception.response
                )
            if "Bad or missing API token." in original_exception.message:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"AI21Exception - {original_exception.message}",
                    model=model,
-                   llm_provider="ai21"
+                   llm_provider="ai21",
+                   response=original_exception.response
                )
            if hasattr(original_exception, "status_code"):
                if original_exception.status_code == 401:
@@ -3394,27 +3541,32 @@ def exception_type(
                    raise AuthenticationError(
                        message=f"AI21Exception - {original_exception.message}",
                        llm_provider="ai21",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 408:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"AI21Exception - {original_exception.message}",
                        model=model,
-                       llm_provider="ai21"
+                       llm_provider="ai21",
+                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"AI21Exception - {original_exception.message}",
                        model=model,
                        llm_provider="ai21",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"AI21Exception - {original_exception.message}",
                        llm_provider="ai21",
+                       model=model,
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3422,7 +3574,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"AI21Exception - {original_exception.message}",
                        llm_provider="ai21",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        elif custom_llm_provider == "nlp_cloud":
            if "detail" in error_str:
@@ -3431,14 +3584,16 @@ def exception_type(
                    raise ContextWindowExceededError(
                        message=f"NLPCloudException - {error_str}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       response=original_exception.response
                    )
                elif "value is not a valid" in error_str:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"NLPCloudException - {error_str}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3446,35 +3601,41 @@ def exception_type(
                        status_code=500,
                        message=f"NLPCloudException - {error_str}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       request=original_exception.request
                    )
            if hasattr(original_exception, "status_code"): # https://docs.nlpcloud.com/?shell#errors
                if original_exception.status_code == 400 or original_exception.status_code == 406 or original_exception.status_code == 413 or original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 401 or original_exception.status_code == 403:
                    exception_mapping_worked = True
                    raise AuthenticationError(
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 522 or original_exception.status_code == 524:
                    exception_mapping_worked = True
                    raise Timeout(
                        message=f"NLPCloudException - {original_exception.message}",
                        model=model,
-                       llm_provider="nlp_cloud"
+                       llm_provider="nlp_cloud",
+                       request=original_exception.request
                    )
                elif original_exception.status_code == 429 or original_exception.status_code == 402:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 500 or original_exception.status_code == 503:
                    exception_mapping_worked = True
@@ -3482,7 +3643,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
                elif original_exception.status_code == 504 or original_exception.status_code == 520:
                    exception_mapping_worked = True
@@ -3497,7 +3659,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"NLPCloudException - {original_exception.message}",
                        llm_provider="nlp_cloud",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        elif custom_llm_provider == "together_ai":
            import json
@@ -3507,49 +3670,56 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif "error" in error_response and "invalid private key" in error_response["error"]:
                exception_mapping_worked = True
                raise AuthenticationError(
                    message=f"TogetherAIException - {error_response['error']}",
                    llm_provider="together_ai",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "error" in error_response and "INVALID_ARGUMENT" in error_response["error"]:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif "error" in error_response and "API key doesn't match expected format." in error_response["error"]:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif "error_type" in error_response and error_response["error_type"] == "validation":
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"TogetherAIException - {error_response['error']}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   response=original_exception.response
                )
            elif original_exception.status_code == 408:
                exception_mapping_worked = True
                raise Timeout(
                    message=f"TogetherAIException - {original_exception.message}",
                    model=model,
-                   llm_provider="together_ai"
+                   llm_provider="together_ai",
+                   request=original_exception.request
                )
            elif original_exception.status_code == 429:
                exception_mapping_worked = True
                raise RateLimitError(
                    message=f"TogetherAIException - {original_exception.message}",
                    llm_provider="together_ai",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            else:
                exception_mapping_worked = True
@@ -3557,7 +3727,8 @@ def exception_type(
                    status_code=original_exception.status_code,
                    message=f"TogetherAIException - {original_exception.message}",
                    llm_provider="together_ai",
-                   model=model
+                   model=model,
+                   request=original_exception.request
                )
        elif custom_llm_provider == "aleph_alpha":
            if "This is longer than the model's maximum context length" in error_str:
@@ -3565,14 +3736,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"AlephAlphaException - {original_exception.message}",
                    llm_provider="aleph_alpha",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "InvalidToken" in error_str or "No token provided" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"AlephAlphaException - {original_exception.message}",
                    llm_provider="aleph_alpha",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif hasattr(original_exception, "status_code"):
                print_verbose(f"status code: {original_exception.status_code}")
@@ -3585,17 +3758,19 @@ def exception_type(
                    )
                elif original_exception.status_code == 400:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"AlephAlphaException - {original_exception.message}",
                        llm_provider="aleph_alpha",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
                    raise RateLimitError(
                        message=f"AlephAlphaException - {original_exception.message}",
                        llm_provider="aleph_alpha",
-                       model=model
+                       model=model,
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 500:
                    exception_mapping_worked = True
@@ -3616,10 +3791,11 @@ def exception_type(
            error_str = str(original_exception)
            if "no such file or directory" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"OllamaException: Invalid Model/Model not loaded - {original_exception}",
                    model=model,
-                   llm_provider="ollama"
+                   llm_provider="ollama",
+                   response=original_exception.response
                )
            elif "Failed to establish a new connection" in error_str:
                exception_mapping_worked = True
@@ -3630,10 +3806,11 @@ def exception_type(
                )
            elif "Invalid response object from API" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"OllamaException: {original_exception}",
                    llm_provider="ollama",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
        elif custom_llm_provider == "vllm":
            if hasattr(original_exception, "status_code"):
@@ -3650,14 +3827,16 @@ def exception_type(
                raise ContextWindowExceededError(
                    message=f"AzureException - {original_exception.message}",
                    llm_provider="azure",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif "invalid_request_error" in error_str:
                exception_mapping_worked = True
-               raise InvalidRequestError(
+               raise BadRequestError(
                    message=f"AzureException - {original_exception.message}",
                    llm_provider="azure",
-                   model=model
+                   model=model,
+                   response=original_exception.response
                )
            elif hasattr(original_exception, "status_code"):
                exception_mapping_worked = True
@@ -3673,14 +3852,16 @@ def exception_type(
                    raise Timeout(
                        message=f"AzureException - {original_exception.message}",
                        model=model,
-                       llm_provider="azure"
+                       llm_provider="azure",
+                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"AzureException - {original_exception.message}",
                        model=model,
                        llm_provider="azure",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
@@ -3688,6 +3869,7 @@ def exception_type(
                        message=f"AzureException - {original_exception.message}",
                        model=model,
                        llm_provider="azure",
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3695,7 +3877,8 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"AzureException - {original_exception.message}",
                        llm_provider="azure",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        elif custom_llm_provider == "custom_openai" or custom_llm_provider == "maritalk":
            if hasattr(original_exception, "status_code"):
@@ -3712,14 +3895,16 @@ def exception_type(
                    raise Timeout(
                        message=f"CustomOpenAIException - {original_exception.message}",
                        model=model,
-                       llm_provider="custom_openai"
+                       llm_provider="custom_openai",
+                       request=original_exception.request
                    )
                if original_exception.status_code == 422:
                    exception_mapping_worked = True
-                   raise InvalidRequestError(
+                   raise BadRequestError(
                        message=f"CustomOpenAIException - {original_exception.message}",
                        model=model,
                        llm_provider="custom_openai",
+                       response=original_exception.response
                    )
                elif original_exception.status_code == 429:
                    exception_mapping_worked = True
@@ -3727,6 +3912,7 @@ def exception_type(
                        message=f"CustomOpenAIException - {original_exception.message}",
                        model=model,
                        llm_provider="custom_openai",
+                       response=original_exception.response
                    )
                else:
                    exception_mapping_worked = True
@@ -3734,17 +3920,20 @@ def exception_type(
                        status_code=original_exception.status_code,
                        message=f"CustomOpenAIException - {original_exception.message}",
                        llm_provider="custom_openai",
-                       model=model
+                       model=model,
+                       request=original_exception.request
                    )
        exception_mapping_worked = True
-       if "InvalidRequestError.__init__() missing 1 required positional argument: 'param'" in str(original_exception): # deal with edge-case invalid request error bug in openai-python sdk
-           raise InvalidRequestError(
+       if "BadRequestError.__init__() missing 1 required positional argument: 'param'" in str(original_exception): # deal with edge-case invalid request error bug in openai-python sdk
+           raise BadRequestError(
                message=f"OpenAIException: This can happen due to missing AZURE_API_VERSION: {str(original_exception)}",
                model=model,
-               llm_provider=custom_llm_provider
+               llm_provider=custom_llm_provider,
+               response=original_exception.response
            )
        else:
-           raise APIError(status_code=500, message=str(original_exception), llm_provider=custom_llm_provider, model=model)
+           # raise BadRequestError(message=str(original_exception), llm_provider=custom_llm_provider, model=model, response=original_exception.response)
+           raise original_exception
    except Exception as e:
        # LOGGING
        exception_logging(
@@ -4079,7 +4268,7 @@ class CustomStreamWrapper:
    def handle_openai_chat_completion_chunk(self, chunk):
        try:
-           str_line = chunk.decode("utf-8") # Convert bytes to string
+           str_line = chunk
            text = ""
            is_finished = False
            finish_reason = None
@@ -4110,11 +4299,17 @@ class CustomStreamWrapper:
    def handle_openai_text_completion_chunk(self, chunk):
        try:
-           str_line = chunk.decode("utf-8") # Convert bytes to string
+           # str_line = chunk.decode("utf-8") # Convert bytes to string
+           str_line = chunk
            text = ""
            is_finished = False
            finish_reason = None
-           if str_line.startswith("data:"):
+           if "data: [DONE]" in str_line:
+               text = ""
+               is_finished = True
+               finish_reason = "stop"
+               return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}
+           elif str_line.startswith("data:"):
                data_json = json.loads(str_line[5:])
                print_verbose(f"delta content: {data_json['choices'][0]['text']}")
                text = data_json["choices"][0].get("text", "")
@@ -4194,7 +4389,6 @@ class CustomStreamWrapper:
    def chunk_creator(self, chunk):
        model_response = ModelResponse(stream=True, model=self.model)
        try:
-           # return this for all models
            completion_obj = {"content": ""}
            if self.custom_llm_provider and self.custom_llm_provider == "anthropic":
@@ -4364,18 +4558,24 @@ class CustomStreamWrapper:
    ## needs to handle the empty string case (even starting chunk can be an empty string)
    def __next__(self):
        while True: # loop until a non-empty string is found
-           if isinstance(self.completion_stream, str):
-               chunk = self.completion_stream
-           else:
+           try:
+               # if isinstance(self.completion_stream, str):
+               #     chunk = self.completion_stream
+               # else:
                chunk = next(self.completion_stream)
-           response = self.chunk_creator(chunk=chunk)
-           if response is not None:
+               response = self.chunk_creator(chunk=chunk)
+               # if response is not None:
                return response
+           except Exception as e:
+               raise StopIteration
    async def __anext__(self):
        try:
-           if self.custom_llm_provider == "openai" or self.custom_llm_provider == "azure":
-               async for chunk in self.completion_stream.content:
+           if (self.custom_llm_provider == "openai"
+               or self.custom_llm_provider == "azure"
+               or self.custom_llm_provider == "custom_openai"
+               or self.custom_llm_provider == "text-completion-openai"):
+               async for chunk in self.completion_stream:
                if chunk == "None" or chunk is None:
                    raise Exception
                processed_chunk = self.chunk_creator(chunk=chunk)
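Note: the snippet below is an illustrative sketch, not part of the patch. It shows how the remapped exception classes defined in litellm/exceptions.py above can be constructed and caught once they carry an httpx.Response, assuming openai>=1.0.0 and httpx are installed; the model name and URL are placeholders.

    import httpx
    from litellm.exceptions import BadRequestError, ContextWindowExceededError

    # Stand-in for `original_exception.response` (placeholder URL, not a real endpoint).
    request = httpx.Request("POST", "https://example.invalid/v1/completions")
    response = httpx.Response(status_code=400, request=request)

    try:
        raise ContextWindowExceededError(
            message="BedrockException: Context Window Error - prompt too long",
            model="placeholder-model",  # hypothetical model name
            llm_provider="bedrock",
            response=response,
        )
    except BadRequestError as e:
        # ContextWindowExceededError now subclasses BadRequestError, so one handler
        # covers both; the mapped status code, provider, and model are preserved.
        print(e.status_code, e.llm_provider, e.model)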
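Similarly, the streaming handlers above now receive already-decoded SSE strings rather than raw bytes. The sketch below is a standalone approximation (not the actual CustomStreamWrapper method) of what handle_openai_text_completion_chunk is expected to do with the new "data: [DONE]" sentinel; the helper name is hypothetical and the tail of the parsing logic is inferred, since the diff truncates it.

    import json

    def parse_text_completion_chunk(str_line: str) -> dict:
        # Assumes str_line is a decoded SSE line such as 'data: {...}' or 'data: [DONE]'.
        if "data: [DONE]" in str_line:
            return {"text": "", "is_finished": True, "finish_reason": "stop"}
        text, is_finished, finish_reason = "", False, None
        if str_line.startswith("data:"):
            data_json = json.loads(str_line[5:])
            text = data_json["choices"][0].get("text", "")
            if data_json["choices"][0].get("finish_reason"):
                is_finished = True
                finish_reason = data_json["choices"][0]["finish_reason"]
        return {"text": text, "is_finished": is_finished, "finish_reason": finish_reason}

    print(parse_text_completion_chunk('data: {"choices": [{"text": "Hello", "finish_reason": null}]}'))
    print(parse_text_completion_chunk("data: [DONE]"))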