Merge branch 'main' into litellm_fix_httpx_transport

Krish Dholakia 2024-07-02 17:17:43 -07:00 committed by GitHub
commit d38f01e956
189 changed files with 8377 additions and 1087 deletions


@ -50,12 +50,15 @@ from tokenizers import Tokenizer
import litellm
import litellm._service_logger # for storing API inputs, outputs, and metadata
import litellm.litellm_core_utils
import litellm.litellm_core_utils.json_validation_rule
from litellm.caching import DualCache
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.litellm_core_utils.exception_mapping_utils import get_error_message
from litellm.litellm_core_utils.llm_request_utils import _ensure_extra_body_is_safe
from litellm.litellm_core_utils.redact_messages import (
redact_message_input_output_from_logging,
)
from litellm.litellm_core_utils.token_counter import get_modified_max_tokens
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.utils import (
CallTypes,
@ -580,7 +583,7 @@ def client(original_function):
else:
return False
def post_call_processing(original_response, model):
def post_call_processing(original_response, model, optional_params: Optional[dict]):
try:
if original_response is None:
pass
@ -595,11 +598,47 @@ def client(original_function):
pass
else:
if isinstance(original_response, ModelResponse):
model_response = original_response.choices[
model_response: Optional[str] = original_response.choices[
0
].message.content
### POST-CALL RULES ###
rules_obj.post_call_rules(input=model_response, model=model)
].message.content # type: ignore
if model_response is not None:
### POST-CALL RULES ###
rules_obj.post_call_rules(
input=model_response, model=model
)
### JSON SCHEMA VALIDATION ###
if (
optional_params is not None
and "response_format" in optional_params
and isinstance(
optional_params["response_format"], dict
)
and "type" in optional_params["response_format"]
and optional_params["response_format"]["type"]
== "json_object"
and "response_schema"
in optional_params["response_format"]
and isinstance(
optional_params["response_format"][
"response_schema"
],
dict,
)
and "enforce_validation"
in optional_params["response_format"]
and optional_params["response_format"][
"enforce_validation"
]
is True
):
# schema given, json response expected, and validation enforced
litellm.litellm_core_utils.json_validation_rule.validate_schema(
schema=optional_params["response_format"][
"response_schema"
],
response=model_response,
)
except Exception as e:
raise e
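For context, a minimal sketch (not part of this diff) of the request shape that activates the new schema-validation branch above; the model name and schema are illustrative placeholders:

import litellm

# Illustrative schema; "enforce_validation": True is what opts into the new check.
response_format = {
    "type": "json_object",
    "response_schema": {
        "type": "object",
        "properties": {"name": {"type": "string"}},
        "required": ["name"],
    },
    "enforce_validation": True,
}

# Placeholder model; it should be one whose provider accepts 'response_schema'
# (see supports_response_schema further down in this diff).
response = litellm.completion(
    model="vertex_ai_beta/gemini-1.5-pro",
    messages=[{"role": "user", "content": "Return a JSON object with a 'name' field."}],
    response_format=response_format,
)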
@ -815,7 +854,7 @@ def client(original_function):
kwargs.get("max_tokens", None) is not None
and model is not None
and litellm.modify_params
== True # user is okay with params being modified
is True # user is okay with params being modified
and (
call_type == CallTypes.acompletion.value
or call_type == CallTypes.completion.value
@ -825,28 +864,19 @@ def client(original_function):
base_model = model
if kwargs.get("hf_model_name", None) is not None:
base_model = f"huggingface/{kwargs.get('hf_model_name')}"
max_output_tokens = (
get_max_tokens(model=base_model) or 4096
) # assume min context window is 4k tokens
user_max_tokens = kwargs.get("max_tokens")
## Scenario 1: User limit + prompt > model limit
messages = None
if len(args) > 1:
messages = args[1]
elif kwargs.get("messages", None):
messages = kwargs["messages"]
input_tokens = token_counter(model=base_model, messages=messages)
input_tokens += max(
0.1 * input_tokens, 10
) # give at least a 10 token buffer. token counting can be imprecise.
if input_tokens > max_output_tokens:
pass # allow call to fail normally
elif user_max_tokens + input_tokens > max_output_tokens:
user_max_tokens = max_output_tokens - input_tokens
print_verbose(f"user_max_tokens: {user_max_tokens}")
kwargs["max_tokens"] = int(
round(user_max_tokens)
) # make sure max tokens is always an int
user_max_tokens = kwargs.get("max_tokens")
modified_max_tokens = get_modified_max_tokens(
model=model,
base_model=base_model,
messages=messages,
user_max_tokens=user_max_tokens,
)
kwargs["max_tokens"] = modified_max_tokens
except Exception as e:
print_verbose(f"Error while checking max token limit: {str(e)}")
# MODEL CALL
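As a rough sketch of what the extracted get_modified_max_tokens helper now takes over from the removed inline code above (names and exact rules here are approximations, not the library's implementation):

from typing import Optional

def clamp_user_max_tokens(
    user_max_tokens: Optional[int],
    input_tokens: int,
    max_output_tokens: int,
) -> Optional[int]:
    # Conceptual sketch of the behavior delegated to get_modified_max_tokens.
    if user_max_tokens is None:
        return None
    # give at least a 10 token buffer; token counting can be imprecise
    buffered_input = input_tokens + max(int(0.1 * input_tokens), 10)
    if buffered_input > max_output_tokens:
        return user_max_tokens  # allow the call to fail normally
    if user_max_tokens + buffered_input > max_output_tokens:
        return max_output_tokens - buffered_input
    return user_max_tokens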
@ -877,7 +907,11 @@ def client(original_function):
return result
### POST-CALL RULES ###
post_call_processing(original_response=result, model=model or None)
post_call_processing(
original_response=result,
model=model or None,
optional_params=kwargs,
)
# [OPTIONAL] ADD TO CACHE
if (
@ -901,6 +935,17 @@ def client(original_function):
model=model,
optional_params=getattr(logging_obj, "optional_params", {}),
)
result._hidden_params["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=getattr(logging_obj, "model", ""),
custom_llm_provider=getattr(
logging_obj, "custom_llm_provider", None
),
call_type=getattr(logging_obj, "call_type", "completion"),
optional_params=getattr(logging_obj, "optional_params", {}),
)
)
result._response_ms = (
end_time - start_time
).total_seconds() * 1000 # return response latency in ms like openai
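With the change above, callers can read the pre-computed cost off the response; a hedged usage sketch (the model name is illustrative, and _hidden_params is an internal attribute):

import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",  # illustrative
    messages=[{"role": "user", "content": "hi"}],
)
# response_cost is attached by the wrapper shown above
print(response._hidden_params.get("response_cost"))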
@ -1294,6 +1339,17 @@ def client(original_function):
model=model,
optional_params=kwargs,
)
result._hidden_params["response_cost"] = (
litellm.response_cost_calculator(
response_object=result,
model=getattr(logging_obj, "model", ""),
custom_llm_provider=getattr(
logging_obj, "custom_llm_provider", None
),
call_type=getattr(logging_obj, "call_type", "completion"),
optional_params=getattr(logging_obj, "optional_params", {}),
)
)
if (
isinstance(result, ModelResponse)
or isinstance(result, EmbeddingResponse)
@ -1304,7 +1360,9 @@ def client(original_function):
).total_seconds() * 1000 # return response latency in ms like openai
### POST-CALL RULES ###
post_call_processing(original_response=result, model=model)
post_call_processing(
original_response=result, model=model, optional_params=kwargs
)
# [OPTIONAL] ADD TO CACHE
if (
@ -1835,9 +1893,10 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
Parameters:
model (str): The model name to be checked.
custom_llm_provider (str): The provider to be checked.
Returns:
bool: True if the model supports function calling, False otherwise.
bool: True if the model supports system messages, False otherwise.
Raises:
Exception: If the given model is not found in model_prices_and_context_window.json.
@ -1855,6 +1914,43 @@ def supports_system_messages(model: str, custom_llm_provider: Optional[str]) ->
)
def supports_response_schema(model: str, custom_llm_provider: Optional[str]) -> bool:
"""
Check if the given model + provider supports 'response_schema' as a param.
Parameters:
model (str): The model name to be checked.
custom_llm_provider (str): The provider to be checked.
Returns:
bool: True if the model supports response_schema, False otherwise.
Does not raise an error; defaults to False and logs the failure via verbose_logger.error.
"""
try:
## GET LLM PROVIDER ##
model, custom_llm_provider, _, _ = get_llm_provider(
model=model, custom_llm_provider=custom_llm_provider
)
if custom_llm_provider == "predibase": # predibase supports this globally
return True
## GET MODEL INFO
model_info = litellm.get_model_info(
model=model, custom_llm_provider=custom_llm_provider
)
if model_info.get("supports_response_schema", False) is True:
return True
return False
except Exception:
verbose_logger.error(
f"Model not in model_prices_and_context_window.json. You passed model={model}, custom_llm_provider={custom_llm_provider}."
)
return False
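A hypothetical usage of the new helper; the model/provider pair below is illustrative:

from litellm.utils import supports_response_schema

# Gemini on the vertex_ai_beta route is one example that advertises
# response_schema support; adjust for your deployment.
if supports_response_schema(model="gemini-1.5-pro", custom_llm_provider="vertex_ai_beta"):
    print("response_schema can be passed via response_format")
else:
    print("fall back to plain json_object responses")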
def supports_function_calling(model: str) -> bool:
"""
Check if the given model supports function calling and return a boolean value.
@ -2312,7 +2408,9 @@ def get_optional_params(
elif k == "hf_model_name" and custom_llm_provider != "sagemaker":
continue
elif (
k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
k.startswith("vertex_")
and custom_llm_provider != "vertex_ai"
and custom_llm_provider != "vertex_ai_beta"
): # allow dynamically setting vertex ai init logic
continue
passed_params[k] = v
@ -2415,6 +2513,7 @@ def get_optional_params(
and custom_llm_provider != "together_ai"
and custom_llm_provider != "groq"
and custom_llm_provider != "nvidia_nim"
and custom_llm_provider != "volcengine"
and custom_llm_provider != "deepseek"
and custom_llm_provider != "codestral"
and custom_llm_provider != "mistral"
@ -2743,6 +2842,11 @@ def get_optional_params(
non_default_params=non_default_params,
optional_params=optional_params,
model=model,
drop_params=(
drop_params
if drop_params is not None and isinstance(drop_params, bool)
else False
),
)
elif (
custom_llm_provider == "vertex_ai" and model in litellm.vertex_anthropic_models
@ -2811,12 +2915,7 @@ def get_optional_params(
optional_params=optional_params,
)
)
else:
optional_params = litellm.AmazonAnthropicConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
else: # bedrock httpx route
elif model in litellm.BEDROCK_CONVERSE_MODELS:
optional_params = litellm.AmazonConverseConfig().map_openai_params(
model=model,
non_default_params=non_default_params,
@ -2827,6 +2926,11 @@ def get_optional_params(
else False
),
)
else:
optional_params = litellm.AmazonAnthropicConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
elif "amazon" in model: # amazon titan llms
_check_valid_arg(supported_params=supported_params)
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
@ -3091,6 +3195,17 @@ def get_optional_params(
optional_params=optional_params,
model=model,
)
elif custom_llm_provider == "volcengine":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.VolcEngineConfig().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
model=model,
)
elif custom_llm_provider == "groq":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
@ -3661,6 +3776,8 @@ def get_supported_openai_params(
return litellm.FireworksAIConfig().get_supported_openai_params()
elif custom_llm_provider == "nvidia_nim":
return litellm.NvidiaNimConfig().get_supported_openai_params()
elif custom_llm_provider == "volcengine":
return litellm.VolcEngineConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "groq":
return [
"temperature",
@ -3672,6 +3789,8 @@ def get_supported_openai_params(
"tool_choice",
"response_format",
"seed",
"extra_headers",
"extra_body",
]
elif custom_llm_provider == "deepseek":
return [
@ -3727,23 +3846,18 @@ def get_supported_openai_params(
return litellm.AzureOpenAIConfig().get_supported_openai_params()
elif custom_llm_provider == "openrouter":
return [
"functions",
"function_call",
"temperature",
"top_p",
"n",
"stream",
"stop",
"max_tokens",
"presence_penalty",
"frequency_penalty",
"logit_bias",
"user",
"response_format",
"presence_penalty",
"repetition_penalty",
"seed",
"tools",
"tool_choice",
"max_retries",
"max_tokens",
"logit_bias",
"logprobs",
"top_logprobs",
"response_format",
"stop",
]
elif custom_llm_provider == "mistral" or custom_llm_provider == "codestral":
# mistral and codestral apis have the exact same params
@ -3761,6 +3875,10 @@ def get_supported_openai_params(
"top_p",
"stop",
"seed",
"tools",
"tool_choice",
"functions",
"function_call",
]
elif custom_llm_provider == "huggingface":
return litellm.HuggingfaceConfig().get_supported_openai_params()
@ -4025,6 +4143,10 @@ def get_llm_provider(
# nvidia_nim is openai compatible, we just need to set this to custom_openai and have the api_base be https://integrate.api.nvidia.com/v1
api_base = "https://integrate.api.nvidia.com/v1"
dynamic_api_key = get_secret("NVIDIA_NIM_API_KEY")
elif custom_llm_provider == "volcengine":
# volcengine is openai compatible, we just need to set this to custom_openai and have the api_base be https://ark.cn-beijing.volces.com/api/v3
api_base = "https://ark.cn-beijing.volces.com/api/v3"
dynamic_api_key = get_secret("VOLCENGINE_API_KEY")
elif custom_llm_provider == "codestral":
# codestral is openai compatible, we just need to set this to custom_openai and have the api_base be https://codestral.mistral.ai/v1
api_base = "https://codestral.mistral.ai/v1"
@ -4334,7 +4456,7 @@ def get_utc_datetime():
return datetime.utcnow() # type: ignore
def get_max_tokens(model: str):
def get_max_tokens(model: str) -> Optional[int]:
"""
Get the maximum number of output tokens allowed for a given model.
@ -4388,7 +4510,8 @@ def get_max_tokens(model: str):
return litellm.model_cost[model]["max_tokens"]
else:
raise Exception()
except:
return None
except Exception:
raise Exception(
f"Model {model} isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
)
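Since get_max_tokens can now return None, callers should keep a fallback; this mirrors the pattern removed earlier in this diff (model name illustrative):

import litellm

# Fall back to a conservative default when the model has no mapped limit,
# as the removed inline code did with `or 4096`.
max_output_tokens = litellm.get_max_tokens("gpt-3.5-turbo") or 4096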
@ -4396,8 +4519,7 @@ def get_max_tokens(model: str):
def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> ModelInfo:
"""
Get a dict for the maximum tokens (context window),
input_cost_per_token, output_cost_per_token for a given model.
Get a dict for the maximum tokens (context window), input_cost_per_token, output_cost_per_token for a given model.
Parameters:
- model (str): The name of the model.
@ -4482,6 +4604,7 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
mode="chat",
supported_openai_params=supported_openai_params,
supports_system_messages=None,
supports_response_schema=None,
)
else:
"""
@ -4503,36 +4626,6 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
pass
else:
raise Exception
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
)
elif model in litellm.model_cost:
_model_info = litellm.model_cost[model]
_model_info["supported_openai_params"] = supported_openai_params
@ -4546,36 +4639,6 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
pass
else:
raise Exception
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
)
elif split_model in litellm.model_cost:
_model_info = litellm.model_cost[split_model]
_model_info["supported_openai_params"] = supported_openai_params
@ -4589,40 +4652,48 @@ def get_model_info(model: str, custom_llm_provider: Optional[str] = None) -> Mod
pass
else:
raise Exception
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
)
else:
raise ValueError(
"This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
)
## PROVIDER-SPECIFIC INFORMATION
if custom_llm_provider == "predibase":
_model_info["supports_response_schema"] = True
return ModelInfo(
max_tokens=_model_info.get("max_tokens", None),
max_input_tokens=_model_info.get("max_input_tokens", None),
max_output_tokens=_model_info.get("max_output_tokens", None),
input_cost_per_token=_model_info.get("input_cost_per_token", 0),
input_cost_per_character=_model_info.get(
"input_cost_per_character", None
),
input_cost_per_token_above_128k_tokens=_model_info.get(
"input_cost_per_token_above_128k_tokens", None
),
output_cost_per_token=_model_info.get("output_cost_per_token", 0),
output_cost_per_character=_model_info.get(
"output_cost_per_character", None
),
output_cost_per_token_above_128k_tokens=_model_info.get(
"output_cost_per_token_above_128k_tokens", None
),
output_cost_per_character_above_128k_tokens=_model_info.get(
"output_cost_per_character_above_128k_tokens", None
),
litellm_provider=_model_info.get(
"litellm_provider", custom_llm_provider
),
mode=_model_info.get("mode"),
supported_openai_params=supported_openai_params,
supports_system_messages=_model_info.get(
"supports_system_messages", None
),
supports_response_schema=_model_info.get(
"supports_response_schema", None
),
)
except Exception:
raise Exception(
"This model isn't mapped yet. Add it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json"
@ -4746,6 +4817,12 @@ def function_to_dict(input_function): # noqa: C901
return result
def modify_url(original_url, new_path):
url = httpx.URL(original_url)
modified_url = url.copy_with(path=new_path)
return str(modified_url)
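A quick, self-contained check of what the new helper does (URLs are illustrative; the function is restated here only so the snippet runs on its own):

import httpx

def modify_url(original_url, new_path):
    url = httpx.URL(original_url)
    return str(url.copy_with(path=new_path))

print(modify_url("https://api.example.com/v1/chat/completions", "/v1/embeddings"))
# prints: https://api.example.com/v1/embeddings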
def load_test_model(
model: str,
custom_llm_provider: str = "",
@ -4975,6 +5052,11 @@ def validate_environment(model: Optional[str] = None) -> dict:
keys_in_environment = True
else:
missing_keys.append("NVIDIA_NIM_API_KEY")
elif custom_llm_provider == "volcengine":
if "VOLCENGINE_API_KEY" in os.environ:
keys_in_environment = True
else:
missing_keys.append("VOLCENGINE_API_KEY")
elif (
custom_llm_provider == "codestral"
or custom_llm_provider == "text-completion-codestral"
@ -5263,6 +5345,27 @@ def convert_to_model_response_object(
hidden_params: Optional[dict] = None,
):
received_args = locals()
### CHECK IF ERROR IN RESPONSE ### - openrouter returns errors inside the response dictionary
if (
response_object is not None
and "error" in response_object
and response_object["error"] is not None
):
error_args = {"status_code": 422, "message": "Error in response object"}
if isinstance(response_object["error"], dict):
if "code" in response_object["error"]:
error_args["status_code"] = response_object["error"]["code"]
if "message" in response_object["error"]:
if isinstance(response_object["error"]["message"], dict):
message_str = json.dumps(response_object["error"]["message"])
else:
message_str = str(response_object["error"]["message"])
error_args["message"] = message_str
raised_exception = Exception()
setattr(raised_exception, "status_code", error_args["status_code"])
setattr(raised_exception, "message", error_args["message"])
raise raised_exception
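For reference, an illustrative openrouter-style error payload of the kind the check above converts into a raised exception (values are made up):

response_object = {
    "error": {
        "code": 429,                       # becomes status_code on the raised exception
        "message": "Rate limit exceeded",  # becomes the exception's message
    }
}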
try:
if response_type == "completion" and (
model_response_object is None
@ -5718,7 +5821,10 @@ def exception_type(
print() # noqa
try:
if model:
error_str = str(original_exception)
if hasattr(original_exception, "message"):
error_str = str(original_exception.message)
else:
error_str = str(original_exception)
if isinstance(original_exception, BaseException):
exception_type = type(original_exception).__name__
else:
@ -5740,6 +5846,18 @@ def exception_type(
_model_group = _metadata.get("model_group")
_deployment = _metadata.get("deployment")
extra_information = f"\nModel: {model}"
exception_provider = "Unknown"
if (
isinstance(custom_llm_provider, str)
and len(custom_llm_provider) > 0
):
exception_provider = (
custom_llm_provider[0].upper()
+ custom_llm_provider[1:]
+ "Exception"
)
if _api_base:
extra_information += f"\nAPI Base: `{_api_base}`"
if (
@ -5790,10 +5908,13 @@ def exception_type(
or custom_llm_provider in litellm.openai_compatible_providers
):
# custom_llm_provider is openai, make it OpenAI
if hasattr(original_exception, "message"):
message = original_exception.message
else:
message = str(original_exception)
message = get_error_message(error_obj=original_exception)
if message is None:
if hasattr(original_exception, "message"):
message = original_exception.message
else:
message = str(original_exception)
if message is not None and isinstance(message, str):
message = message.replace("OPENAI", custom_llm_provider.upper())
message = message.replace("openai", custom_llm_provider)
@ -6126,7 +6247,6 @@ def exception_type(
)
elif (
original_exception.status_code == 400
or original_exception.status_code == 422
or original_exception.status_code == 413
):
exception_mapping_worked = True
@ -6136,6 +6256,14 @@ def exception_type(
llm_provider="replicate",
response=original_exception.response,
)
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise UnprocessableEntityError(
message=f"ReplicateException - {original_exception.message}",
model=model,
llm_provider="replicate",
response=original_exception.response,
)
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
@ -7239,10 +7367,17 @@ def exception_type(
request=original_exception.request,
)
elif custom_llm_provider == "azure":
message = get_error_message(error_obj=original_exception)
if message is None:
if hasattr(original_exception, "message"):
message = original_exception.message
else:
message = str(original_exception)
if "Internal server error" in error_str:
exception_mapping_worked = True
raise litellm.InternalServerError(
message=f"AzureException Internal server error - {original_exception.message}",
message=f"AzureException Internal server error - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7255,7 +7390,7 @@ def exception_type(
elif "This model's maximum context length is" in error_str:
exception_mapping_worked = True
raise ContextWindowExceededError(
message=f"AzureException ContextWindowExceededError - {original_exception.message}",
message=f"AzureException ContextWindowExceededError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7264,7 +7399,7 @@ def exception_type(
elif "DeploymentNotFound" in error_str:
exception_mapping_worked = True
raise NotFoundError(
message=f"AzureException NotFoundError - {original_exception.message}",
message=f"AzureException NotFoundError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7284,7 +7419,7 @@ def exception_type(
):
exception_mapping_worked = True
raise ContentPolicyViolationError(
message=f"litellm.ContentPolicyViolationError: AzureException - {original_exception.message}",
message=f"litellm.ContentPolicyViolationError: AzureException - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7293,7 +7428,7 @@ def exception_type(
elif "invalid_request_error" in error_str:
exception_mapping_worked = True
raise BadRequestError(
message=f"AzureException BadRequestError - {original_exception.message}",
message=f"AzureException BadRequestError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7305,7 +7440,7 @@ def exception_type(
):
exception_mapping_worked = True
raise AuthenticationError(
message=f"{exception_provider} AuthenticationError - {original_exception.message}",
message=f"{exception_provider} AuthenticationError - {message}",
llm_provider=custom_llm_provider,
model=model,
litellm_debug_info=extra_information,
@ -7316,7 +7451,7 @@ def exception_type(
if original_exception.status_code == 400:
exception_mapping_worked = True
raise BadRequestError(
message=f"AzureException - {original_exception.message}",
message=f"AzureException - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7325,7 +7460,7 @@ def exception_type(
elif original_exception.status_code == 401:
exception_mapping_worked = True
raise AuthenticationError(
message=f"AzureException AuthenticationError - {original_exception.message}",
message=f"AzureException AuthenticationError - {message}",
llm_provider="azure",
model=model,
litellm_debug_info=extra_information,
@ -7334,7 +7469,7 @@ def exception_type(
elif original_exception.status_code == 408:
exception_mapping_worked = True
raise Timeout(
message=f"AzureException Timeout - {original_exception.message}",
message=f"AzureException Timeout - {message}",
model=model,
litellm_debug_info=extra_information,
llm_provider="azure",
@ -7342,7 +7477,7 @@ def exception_type(
elif original_exception.status_code == 422:
exception_mapping_worked = True
raise BadRequestError(
message=f"AzureException BadRequestError - {original_exception.message}",
message=f"AzureException BadRequestError - {message}",
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
@ -7351,7 +7486,7 @@ def exception_type(
elif original_exception.status_code == 429:
exception_mapping_worked = True
raise RateLimitError(
message=f"AzureException RateLimitError - {original_exception.message}",
message=f"AzureException RateLimitError - {message}",
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
@ -7360,7 +7495,7 @@ def exception_type(
elif original_exception.status_code == 503:
exception_mapping_worked = True
raise ServiceUnavailableError(
message=f"AzureException ServiceUnavailableError - {original_exception.message}",
message=f"AzureException ServiceUnavailableError - {message}",
model=model,
llm_provider="azure",
litellm_debug_info=extra_information,
@ -7369,7 +7504,7 @@ def exception_type(
elif original_exception.status_code == 504: # gateway timeout error
exception_mapping_worked = True
raise Timeout(
message=f"AzureException Timeout - {original_exception.message}",
message=f"AzureException Timeout - {message}",
model=model,
litellm_debug_info=extra_information,
llm_provider="azure",
@ -7378,7 +7513,7 @@ def exception_type(
exception_mapping_worked = True
raise APIError(
status_code=original_exception.status_code,
message=f"AzureException APIError - {original_exception.message}",
message=f"AzureException APIError - {message}",
llm_provider="azure",
litellm_debug_info=extra_information,
model=model,
@ -7810,6 +7945,7 @@ class CustomStreamWrapper:
"<s>",
"</s>",
"<|im_end|>",
"<|im_start|>",
]
self.holding_chunk = ""
self.complete_response = ""