diff --git a/.gitignore b/.gitignore
index e3e1bee69..1278b7867 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,4 @@ litellm/proxy/api_log.json
 .idea/
 router_config.yaml
 litellm_server/config.yaml
+litellm/proxy/_secret_config.yaml
diff --git a/docs/my-website/docs/routing.md b/docs/my-website/docs/routing.md
index 20dfdcdb8..2f19b4bdb 100644
--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@@ -1,8 +1,6 @@
 import Image from '@theme/IdealImage';
 
-# Reliability - Fallbacks, Azure Deployments, etc.
-
-## Manage Multiple Deployments
+# Manage Multiple Deployments
 
 Use this if you're trying to load-balance across multiple deployments (e.g. Azure/OpenAI).
 
diff --git a/litellm/exceptions.py b/litellm/exceptions.py
index 941d79bd2..999d9baa0 100644
--- a/litellm/exceptions.py
+++ b/litellm/exceptions.py
@@ -110,12 +110,13 @@ class APIError(APIError): # type: ignore
 
 # raised if an invalid request (not get, delete, put, post) is made
 class APIConnectionError(APIConnectionError):  # type: ignore
-    def __init__(self, message, llm_provider, model):
+    def __init__(self, message, llm_provider, model, request: httpx.Request):
         self.message = message
         self.llm_provider = llm_provider
         self.model = model
         super().__init__(
-            self.message
+            message=self.message,
+            request=request
         )
 
 class OpenAIError(OpenAIError):  # type: ignore
diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py
index 9ec140a50..b95f05a13 100644
--- a/litellm/llms/azure.py
+++ b/litellm/llms/azure.py
@@ -195,7 +195,7 @@ class AzureChatCompletion(BaseLLM):
                 method="POST"
             ) as response:
                 if response.status_code != 200:
-                    raise AzureOpenAIError(status_code=response.status_code, message=response.text)
+                    raise AzureOpenAIError(status_code=response.status_code, message="An error occurred while streaming")
 
                 completion_stream = response.iter_lines()
                 streamwrapper = CustomStreamWrapper(completion_stream=completion_stream, model=model, custom_llm_provider="azure",logging_obj=logging_obj)
diff --git a/litellm/llms/vllm.py b/litellm/llms/vllm.py
index 47144bf2f..ce391d4b5 100644
--- a/litellm/llms/vllm.py
+++ b/litellm/llms/vllm.py
@@ -2,7 +2,7 @@ import os
 import json
 from enum import Enum
 import requests
-import time
+import time, httpx
 from typing import Callable, Any
 from litellm.utils import ModelResponse, Usage
 from .prompt_templates.factory import prompt_factory, custom_prompt
@@ -11,6 +11,8 @@ class VLLMError(Exception):
     def __init__(self, status_code, message):
         self.status_code = status_code
         self.message = message
+        self.request = httpx.Request(method="POST", url="http://0.0.0.0:8000")
+        self.response = httpx.Response(status_code=status_code, request=self.request)
         super().__init__(
             self.message
         )  # Call the base class constructor with the parameters it needs
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index bc402e0ac..1519769d4 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -453,25 +453,18 @@ def litellm_completion(*args, **kwargs):
         kwargs["max_tokens"] = user_max_tokens
     if user_api_base:
         kwargs["api_base"] = user_api_base
-    ## CHECK CONFIG ##
-    if llm_model_list != None:
-        llm_models = [m["model_name"] for m in llm_model_list]
-        if kwargs["model"] in llm_models:
-            for m in llm_model_list:
-                if kwargs["model"] == m["model_name"]: # if user has specified a config, this will use the config
-                    for key, value in m["litellm_params"].items():
-                        kwargs[key] = value
-                    break
-        else:
-            print_verbose("user sent model not in config, using default config model")
-            default_model = llm_model_list[0]
-            litellm_params = default_model.get('litellm_params', None)
-            for key, value in litellm_params.items():
-                kwargs[key] = value
-    if call_type == "chat_completion":
-        response = litellm.completion(*args, **kwargs)
-    elif call_type == "text_completion":
-        response = litellm.text_completion(*args, **kwargs)
+    ## ROUTE TO CORRECT ENDPOINT ##
+    router_model_names = [m["model_name"] for m in llm_model_list] if llm_model_list is not None else []
+    if llm_router is not None and kwargs["model"] in router_model_names: # model in router model list
+        if call_type == "chat_completion":
+            response = llm_router.completion(*args, **kwargs)
+        elif call_type == "text_completion":
+            response = llm_router.text_completion(*args, **kwargs)
+    else:
+        if call_type == "chat_completion":
+            response = litellm.completion(*args, **kwargs)
+        elif call_type == "text_completion":
+            response = litellm.text_completion(*args, **kwargs)
     if 'stream' in kwargs and kwargs['stream'] == True: # use generate_responses to stream responses
         return StreamingResponse(data_generator(response), media_type='text/event-stream')
     return response
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index a7af44534..ec5c3ef5b 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -579,36 +579,34 @@ def test_completion_openai_with_more_optional_params():
         pytest.fail(f"Error occurred: {e}")
 
 # test_completion_openai_with_more_optional_params()
-# def test_completion_openai_azure_with_functions():
-#     function1 = [
-#         {
-#             "name": "get_current_weather",
-#             "description": "Get the current weather in a given location",
-#             "parameters": {
-#                 "type": "object",
-#                 "properties": {
-#                     "location": {
-#                         "type": "string",
-#                         "description": "The city and state, e.g. San Francisco, CA",
-#                     },
-#                     "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
-#                 },
-#                 "required": ["location"],
-#             },
-#         }
-#     ]
-#     try:
-#         response = completion(
-#             model="azure/chatgpt-functioncalling", messages=messages, stream=True
-#         )
-#         # Add any assertions here to check the response
-#         print(response)
-#         for chunk in response:
-#             print(chunk)
-#             print(chunk["choices"][0]["finish_reason"])
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
-# test_completion_openai_azure_with_functions()
+def test_completion_openai_azure_with_functions():
+    function1 = [
+        {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + } + ] + try: + messages = [{"role": "user", "content": "What is the weather like in Boston?"}] + response = completion( + model="azure/chatgpt-functioncalling", messages=messages, functions=function1 + ) + # Add any assertions here to check the response + print(response) + except Exception as e: + pytest.fail(f"Error occurred: {e}") +test_completion_openai_azure_with_functions() def test_completion_azure(): diff --git a/litellm/utils.py b/litellm/utils.py index 4b75cc8c1..a9cc6e465 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -2896,7 +2896,7 @@ def convert_to_model_response_object(response_object: Optional[dict]=None, model raise Exception("Error in response object format") choice_list=[] for idx, choice in enumerate(response_object["choices"]): - message = Message(content=choice["message"]["content"], role=choice["message"]["role"], function_call=choice["message"].get("function_call", None)) + message = Message(content=choice["message"].get("content", None), role=choice["message"]["role"], function_call=choice["message"].get("function_call", None)) finish_reason = choice.get("finish_reason", None) if finish_reason == None: # gpt-4 vision can return 'finish_reason' or 'finish_details' @@ -4018,7 +4018,8 @@ def exception_type( raise APIConnectionError( message=f"VLLMException - {original_exception.message}", llm_provider="vllm", - model=model + model=model, + request=original_exception.request ) elif custom_llm_provider == "azure": if "This model's maximum context length is" in error_str: @@ -4093,7 +4094,8 @@ def exception_type( raise APIConnectionError( message=f"{str(original_exception)}", llm_provider=custom_llm_provider, - model=model + model=model, + request=original_exception.request ) except Exception as e: # LOGGING