diff --git a/litellm/main.py b/litellm/main.py
index e98d88656..ffe562641 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1313,11 +1313,12 @@ def completion(
             or model in litellm.petals_models
         ):
             api_base = (
-                litellm.api_base or
-                api_base
+                api_base or
+                litellm.api_base
             )
 
             custom_llm_provider = "petals"
+            stream = optional_params.pop("stream", False)
             model_response = petals.completion(
                 model=model,
                 messages=messages,
diff --git a/litellm/router.py b/litellm/router.py
index fab7e58c2..9135406fd 100644
--- a/litellm/router.py
+++ b/litellm/router.py
@@ -403,7 +403,7 @@ class Router:
             # if the function call is successful, no exception will be raised and we'll break out of the loop
             response = await original_function(*args, **kwargs)
             return response
-        except (Exception, asyncio.CancelledError) as e:
+        except Exception as e:
             original_exception = e
             ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR w/ fallbacks available
             if ((isinstance(original_exception, litellm.ContextWindowExceededError) and context_window_fallbacks is None)
@@ -411,10 +411,7 @@ class Router:
                 raise original_exception
             ### RETRY
             #### check if it should retry + back-off if required
-            if isinstance(original_exception, asyncio.CancelledError):
-                timeout = 0 # immediately retry
-                await asyncio.sleep(timeout)
-            elif hasattr(original_exception, "status_code") and hasattr(original_exception, "response") and litellm._should_retry(status_code=original_exception.status_code):
+            if hasattr(original_exception, "status_code") and hasattr(original_exception, "response") and litellm._should_retry(status_code=original_exception.status_code):
                 if hasattr(original_exception.response, "headers"):
                     timeout = litellm._calculate_retry_after(remaining_retries=num_retries, max_retries=num_retries, response_headers=original_exception.response.headers)
                 else:
@@ -432,11 +429,8 @@ class Router:
                         response = await response
                     return response
 
-                except (Exception, asyncio.CancelledError) as e:
-                    if isinstance(original_exception, asyncio.CancelledError):
-                        timeout = 0 # immediately retry
-                        await asyncio.sleep(timeout)
-                    elif hasattr(e, "status_code") and hasattr(e, "response") and litellm._should_retry(status_code=e.status_code):
+                except Exception as e:
+                    if hasattr(e, "status_code") and hasattr(e, "response") and litellm._should_retry(status_code=e.status_code):
                         remaining_retries = num_retries - current_attempt
                         if hasattr(e.response, "headers"):
                             timeout = litellm._calculate_retry_after(remaining_retries=num_retries, max_retries=num_retries, response_headers=e.response.headers)
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index 597020942..1f51e54bb 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -631,7 +631,7 @@ def test_completion_bedrock_ai21_stream():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-test_completion_bedrock_ai21_stream()
+# test_completion_bedrock_ai21_stream()
 
 # def test_completion_sagemaker_stream():
 #     try:
@@ -760,6 +760,17 @@ def hf_test_completion_tgi_stream():
         pytest.fail(f"Error occurred: {e}")
 
 # hf_test_completion_tgi_stream()
+def test_petals():
+    print(f"making petals call")
+    response = completion(
+        model="petals/meta-llama/Llama-2-70b-chat-hf",
+        messages=[{ "content": "Hello, how are you?","role": "user"}],
+        # stream=True,
+        base_url="https://chat.petals.dev/api/v1/generate"
+    )
+
+    print(f"response: {response}")
+test_petals()
# def test_completion_aleph_alpha():
#     try:
#         response = completion(
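
Note on the router change above: with the `asyncio.CancelledError` fast-retry branch removed, retry decisions hinge entirely on the exception's `status_code` and `response.headers`. The sketch below is only an illustration of that general shape; `should_retry`, `retry_after`, and `call_with_retries` are hypothetical stand-ins, not the actual `litellm._should_retry` / `litellm._calculate_retry_after` implementations, and the retryable status codes are an assumption.

```python
import asyncio
import random

RETRYABLE_STATUS_CODES = {408, 429, 500, 502, 503, 504}  # assumption: typical transient errors

def should_retry(status_code: int) -> bool:
    # Stand-in for a retry predicate keyed purely off the HTTP status code.
    return status_code in RETRYABLE_STATUS_CODES

def retry_after(remaining_retries: int, max_retries: int, response_headers=None) -> float:
    # Prefer a server-provided Retry-After header when present.
    if response_headers and response_headers.get("retry-after"):
        try:
            return float(response_headers["retry-after"])
        except ValueError:
            pass
    # Otherwise use capped exponential back-off with a little jitter.
    attempt = max_retries - remaining_retries
    return min(2 ** attempt, 8) * (0.5 + random.random() / 2)

async def call_with_retries(fn, num_retries: int = 3):
    # Retry only when the raised exception carries a retryable status code.
    for current_attempt in range(num_retries + 1):
        try:
            return await fn()
        except Exception as e:
            status = getattr(e, "status_code", None)
            response = getattr(e, "response", None)
            if current_attempt == num_retries or status is None or not should_retry(status):
                raise
            headers = getattr(response, "headers", None)
            await asyncio.sleep(retry_after(num_retries - current_attempt, num_retries, headers))
```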