From 9b7465a222ee3c1873382144bcc15000109f6ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebasti=C3=A1n=20Est=C3=A9vez?= Date: Wed, 15 May 2024 17:06:54 -0400 Subject: [PATCH 01/35] Another dictionary changed size during iteration error ``` ImportError while loading conftest '/astra-assistants-api/tests/openai-sdk/conftest.py'. conftest.py:13: in from impl.astra_vector import CassandraClient ../../impl/astra_vector.py:45: in from impl.services.inference_utils import get_embeddings ../../impl/services/inference_utils.py:5: in import litellm .cache/pypoetry/virtualenvs/astra-assistants-api-eiSmbCzm-py3.10/lib/python3.10/site-packages/litellm/__init__.py:678: in from .main import * # type: ignore .cache/pypoetry/virtualenvs/astra-assistants-api-eiSmbCzm-py3.10/lib/python3.10/site-packages/litellm/main.py:73: in from .llms.azure_text import AzureTextCompletion .cache/pypoetry/virtualenvs/astra-assistants-api-eiSmbCzm-py3.10/lib/python3.10/site-packages/litellm/llms/azure_text.py:23: in openai_text_completion_config = OpenAITextCompletionConfig() .cache/pypoetry/virtualenvs/astra-assistants-api-eiSmbCzm-py3.10/lib/python3.10/site-packages/litellm/llms/openai.py:192: in __init__ for key, value in locals_.items(): E RuntimeError: dictionary changed size during iteration ``` --- litellm/llms/openai.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py index 7acbdfae0..b8fde6e4f 100644 --- a/litellm/llms/openai.py +++ b/litellm/llms/openai.py @@ -96,7 +96,7 @@ class MistralConfig: safe_prompt: Optional[bool] = None, response_format: Optional[dict] = None, ) -> None: - locals_ = locals() + locals_ = locals().copy() for key, value in locals_.items(): if key != "self" and value is not None: setattr(self.__class__, key, value) @@ -211,7 +211,7 @@ class OpenAIConfig: temperature: Optional[int] = None, top_p: Optional[int] = None, ) -> None: - locals_ = locals() + locals_ = locals().copy() for key, value in locals_.items(): if key != "self" and value is not None: setattr(self.__class__, key, value) @@ -294,7 +294,7 @@ class OpenAITextCompletionConfig: temperature: Optional[float] = None, top_p: Optional[float] = None, ) -> None: - locals_ = locals() + locals_ = locals().copy() for key, value in locals_.items(): if key != "self" and value is not None: setattr(self.__class__, key, value) From 6368d5a7254134f50e5855cf20db50659ffe6740 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 17 May 2024 18:50:33 -0700 Subject: [PATCH 02/35] feat - read cooldown time from exception header --- litellm/router.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/litellm/router.py b/litellm/router.py index 6400ff64e..80f1f900c 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -1923,10 +1923,28 @@ class Router: metadata = kwargs.get("litellm_params", {}).get("metadata", None) _model_info = kwargs.get("litellm_params", {}).get("model_info", {}) + exception_response = getattr(exception, "response", {}) + exception_headers = getattr(exception_response, "headers", None) + _time_to_cooldown = self.cooldown_time + + if exception_headers is not None: + + _time_to_cooldown = ( + litellm.utils._get_retry_after_from_exception_header( + response_headers=exception_headers + ) + ) + + if _time_to_cooldown < 0: + # if the response headers did not read it -> set to default cooldown time + _time_to_cooldown = self.cooldown_time + if isinstance(_model_info, dict): deployment_id = _model_info.get("id", 
None) self._set_cooldown_deployments( - exception_status=exception_status, deployment=deployment_id + exception_status=exception_status, + deployment=deployment_id, + time_to_cooldown=_time_to_cooldown, ) # setting deployment_id in cooldown deployments if custom_llm_provider: model_name = f"{custom_llm_provider}/{model_name}" @@ -2026,7 +2044,10 @@ class Router: return True def _set_cooldown_deployments( - self, exception_status: Union[str, int], deployment: Optional[str] = None + self, + exception_status: Union[str, int], + deployment: Optional[str] = None, + time_to_cooldown: Optional[float] = None, ): """ Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute @@ -2053,6 +2074,8 @@ class Router: f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}" ) cooldown_time = self.cooldown_time or 1 + if time_to_cooldown is not None: + cooldown_time = time_to_cooldown if isinstance(exception_status, str): try: @@ -2090,7 +2113,9 @@ class Router: ) self.send_deployment_cooldown_alert( - deployment_id=deployment, exception_status=exception_status + deployment_id=deployment, + exception_status=exception_status, + cooldown_time=cooldown_time, ) else: self.failed_calls.set_cache( @@ -3751,7 +3776,10 @@ class Router: print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa def send_deployment_cooldown_alert( - self, deployment_id: str, exception_status: Union[str, int] + self, + deployment_id: str, + exception_status: Union[str, int], + cooldown_time: float, ): try: from litellm.proxy.proxy_server import proxy_logging_obj @@ -3775,7 +3803,7 @@ class Router: ) asyncio.create_task( proxy_logging_obj.slack_alerting_instance.send_alert( - message=f"Router: Cooling down Deployment:\nModel Name: {_model_name}\nAPI Base: {_api_base}\n{self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns", + message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{cooldown_time} seconds`\nGot exception: `{str(exception_status)}`\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns", alert_type="cooldown_deployment", level="Low", ) From f7a16753378e8c5b49de4214f3111756a3375507 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Fri, 17 May 2024 18:52:45 -0700 Subject: [PATCH 03/35] fix - cooldown based on exception header --- litellm/utils.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/litellm/utils.py b/litellm/utils.py index 5d5c2b69c..5f48d60b8 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8008,11 +8008,8 @@ def _should_retry(status_code: int): return False -def _calculate_retry_after( - remaining_retries: int, - max_retries: int, +def _get_retry_after_from_exception_header( response_headers: Optional[httpx.Headers] = None, - min_timeout: int = 0, ): """ Reimplementation of openai's calculate retry after, since that one can't be imported. 
@@ -8038,10 +8035,20 @@ def _calculate_retry_after( retry_after = int(retry_date - time.time()) else: retry_after = -1 + return retry_after - except Exception: + except Exception as e: retry_after = -1 + +def _calculate_retry_after( + remaining_retries: int, + max_retries: int, + response_headers: Optional[httpx.Headers] = None, + min_timeout: int = 0, +): + retry_after = _get_retry_after_from_exception_header(response_headers) + # If the API asks us to wait a certain amount of time (and it's a reasonable amount), just do what it says. if 0 < retry_after <= 60: return retry_after From 5ba5f15b56f411eabcec83b6f34f3274aa7971ff Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 08:14:43 -0700 Subject: [PATCH 04/35] test - test_aimage_generation_vertex_ai --- litellm/tests/test_image_generation.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 82068a115..37e560b24 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -169,3 +169,22 @@ async def test_aimage_generation_bedrock_with_optional_params(): pass else: pytest.fail(f"An exception occurred - {str(e)}") + + +@pytest.mark.asyncio +async def test_aimage_generation_vertex_ai(): + try: + response = await litellm.aimage_generation( + prompt="A cute baby sea otter", + model="vertex_ai/imagegeneration@006", + ) + print(f"response: {response}") + except litellm.RateLimitError as e: + pass + except litellm.ContentPolicyViolationError: + pass # Azure randomly raises these errors - skip when they occur + except Exception as e: + if "Your task failed as a result of our safety system." in str(e): + pass + else: + pytest.fail(f"An exception occurred - {str(e)}") From 24951d44a4d9061fd638b619918a8a6c17718b03 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 09:51:15 -0700 Subject: [PATCH 05/35] feat - working httpx requests vertex ai image gen --- litellm/llms/vertex_httpx.py | 156 +++++++++++++++++++++++++ litellm/main.py | 31 +++++ litellm/tests/test_image_generation.py | 2 +- 3 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 litellm/llms/vertex_httpx.py diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py new file mode 100644 index 000000000..ca850674b --- /dev/null +++ b/litellm/llms/vertex_httpx.py @@ -0,0 +1,156 @@ +import os, types +import json +from enum import Enum +import requests # type: ignore +import time +from typing import Callable, Optional, Union, List +from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason +import litellm, uuid +import httpx, inspect # type: ignore +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler +from .base import BaseLLM + + +class VertexAIError(Exception): + def __init__(self, status_code, message): + self.status_code = status_code + self.message = message + self.request = httpx.Request( + method="POST", url=" https://cloud.google.com/vertex-ai/" + ) + self.response = httpx.Response(status_code=status_code, request=self.request) + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + + +class VertexLLM(BaseLLM): + from google.auth.credentials import Credentials # type: ignore[import-untyped] + + def __init__(self) -> None: + from google.auth.credentials import Credentials # type: ignore[import-untyped] + + super().__init__() + self.access_token: Optional[str] = None + 
self.refresh_token: Optional[str] = None + self._credentials: Optional[Credentials] = None + self.project_id: Optional[str] = None + + def load_auth(self) -> tuple[Credentials, str]: + from google.auth.transport.requests import Request # type: ignore[import-untyped] + from google.auth.credentials import Credentials # type: ignore[import-untyped] + import google.auth as google_auth + + credentials, project_id = google_auth.default( + scopes=["https://www.googleapis.com/auth/cloud-platform"], + ) + + credentials.refresh(Request()) + + if not project_id: + raise ValueError("Could not resolve project_id") + + if not isinstance(project_id, str): + raise TypeError( + f"Expected project_id to be a str but got {type(project_id)}" + ) + + return credentials, project_id + + def refresh_auth(self, credentials: Credentials) -> None: + from google.auth.transport.requests import Request # type: ignore[import-untyped] + + credentials.refresh(Request()) + + def _prepare_request(self, request: httpx.Request) -> None: + access_token = self._ensure_access_token() + + if request.headers.get("Authorization"): + # already authenticated, nothing for us to do + return + + request.headers["Authorization"] = f"Bearer {access_token}" + + def _ensure_access_token(self) -> str: + if self.access_token is not None: + return self.access_token + + if not self._credentials: + self._credentials, project_id = self.load_auth() + if not self.project_id: + self.project_id = project_id + else: + self.refresh_auth(self._credentials) + + if not self._credentials.token: + raise RuntimeError("Could not resolve API token from the environment") + + assert isinstance(self._credentials.token, str) + return self._credentials.token + + async def aimage_generation( + self, + prompt: str, + vertex_project: str, + vertex_location: str, + model: Optional[ + str + ] = "imagegeneration", # vertex ai uses imagegeneration as the default model + client: Optional[AsyncHTTPHandler] = None, + optional_params: Optional[dict] = None, + timeout: Optional[int] = None, + logging_obj=None, + model_response=None, + ): + response = None + if client is None: + _params = {} + if timeout is not None: + if isinstance(timeout, float) or isinstance(timeout, int): + _httpx_timeout = httpx.Timeout(timeout) + _params["timeout"] = _httpx_timeout + client = AsyncHTTPHandler(**_params) # type: ignore + else: + client = client # type: ignore + + # make POST request to + # https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict + url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:predict" + + """ + Docs link: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/imagegeneration?project=adroit-crow-413218 + curl -X POST \ + -H "Authorization: Bearer $(gcloud auth print-access-token)" \ + -H "Content-Type: application/json; charset=utf-8" \ + -d { + "instances": [ + { + "prompt": "a cat" + } + ] + } \ + "https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict" + """ + + import vertexai + + auth_header = self._ensure_access_token() + + request_data = { + "instances": [{"prompt": prompt}], + "parameters": {"sampleCount": 1}, + } + + response = await client.post( + url=url, + headers={ + "Content-Type": "application/json; charset=utf-8", + "Authorization": f"Bearer {auth_header}", + }, + 
data=json.dumps(request_data), + ) + + if response.status_code != 200: + raise Exception(f"Error: {response.status_code} {response.text}") + + return model_response diff --git a/litellm/main.py b/litellm/main.py index 14fd5439f..198e191fd 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -79,6 +79,7 @@ from .llms.anthropic_text import AnthropicTextCompletion from .llms.huggingface_restapi import Huggingface from .llms.predibase import PredibaseChatCompletion from .llms.bedrock_httpx import BedrockLLM +from .llms.vertex_httpx import VertexLLM from .llms.triton import TritonChatCompletion from .llms.prompt_templates.factory import ( prompt_factory, @@ -118,6 +119,7 @@ huggingface = Huggingface() predibase_chat_completions = PredibaseChatCompletion() triton_chat_completions = TritonChatCompletion() bedrock_chat_completion = BedrockLLM() +vertex_chat_completion = VertexLLM() ####### COMPLETION ENDPOINTS ################ @@ -3854,6 +3856,35 @@ def image_generation( model_response=model_response, aimg_generation=aimg_generation, ) + elif custom_llm_provider == "vertex_ai": + vertex_ai_project = ( + optional_params.pop("vertex_project", None) + or optional_params.pop("vertex_ai_project", None) + or litellm.vertex_project + or get_secret("VERTEXAI_PROJECT") + ) + vertex_ai_location = ( + optional_params.pop("vertex_location", None) + or optional_params.pop("vertex_ai_location", None) + or litellm.vertex_location + or get_secret("VERTEXAI_LOCATION") + ) + vertex_credentials = ( + optional_params.pop("vertex_credentials", None) + or optional_params.pop("vertex_ai_credentials", None) + or get_secret("VERTEXAI_CREDENTIALS") + ) + model_response = vertex_chat_completion.aimage_generation( # type: ignore + model=model, + prompt=prompt, + timeout=timeout, + logging_obj=litellm_logging_obj, + optional_params=optional_params, + model_response=model_response, + vertex_project=vertex_ai_project, + vertex_location=vertex_ai_location, + ) + return model_response except Exception as e: ## Map to OpenAI Exception diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 37e560b24..3de3ba763 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -175,7 +175,7 @@ async def test_aimage_generation_bedrock_with_optional_params(): async def test_aimage_generation_vertex_ai(): try: response = await litellm.aimage_generation( - prompt="A cute baby sea otter", + prompt="An olympic size swimming pool", model="vertex_ai/imagegeneration@006", ) print(f"response: {response}") From a4f906b464e134fbb49fab7b1efced1268c22ec6 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 10:09:41 -0700 Subject: [PATCH 06/35] feat - add litellm.ImageResponse --- litellm/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/litellm/__init__.py b/litellm/__init__.py index ac2b420d7..83e30d775 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -724,6 +724,8 @@ from .utils import ( get_supported_openai_params, get_api_base, get_first_chars_messages, + ModelResponse, + ImageResponse, ) from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic import AnthropicConfig From 372323c38a5ba849b51837f2eb80e89d74f2a696 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 20 May 2024 10:30:23 -0700 Subject: [PATCH 07/35] feat(proxy_server.py): allow admin to return rejected response as string to user Closes https://github.com/BerriAI/litellm/issues/3671 --- litellm/integrations/custom_logger.py | 15 +++- 
litellm/proxy/proxy_server.py | 35 ++++++++- litellm/proxy/utils.py | 109 +++++++++++++++++++++++++- litellm/utils.py | 26 ++++++ 4 files changed, 175 insertions(+), 10 deletions(-) diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py index d50882592..accb4f80f 100644 --- a/litellm/integrations/custom_logger.py +++ b/litellm/integrations/custom_logger.py @@ -4,7 +4,7 @@ import dotenv, os from litellm.proxy._types import UserAPIKeyAuth from litellm.caching import DualCache - +from litellm.utils import ModelResponse from typing import Literal, Union, Optional import traceback @@ -64,8 +64,17 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, - call_type: Literal["completion", "embeddings", "image_generation"], - ): + call_type: Literal[ + "completion", + "text_completion", + "embeddings", + "image_generation", + "moderation", + "audio_transcription", + ], + ) -> Optional[ + Union[Exception, str, dict] + ]: # raise exception if invalid, return a str for the user to receive - if rejected, or return a modified dictionary for passing into litellm pass async def async_post_call_failure_hook( diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index ef5380d08..6ad3e598a 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -3761,11 +3761,24 @@ async def chat_completion( data["litellm_logging_obj"] = logging_obj - ### CALL HOOKS ### - modify incoming data before calling the model - data = await proxy_logging_obj.pre_call_hook( + ### CALL HOOKS ### - modify/reject incoming data before calling the model + data = await proxy_logging_obj.pre_call_hook( # type: ignore user_api_key_dict=user_api_key_dict, data=data, call_type="completion" ) + if isinstance(data, litellm.ModelResponse): + return data + elif isinstance(data, litellm.CustomStreamWrapper): + selected_data_generator = select_data_generator( + response=data, + user_api_key_dict=user_api_key_dict, + request_data={}, + ) + + return StreamingResponse( + selected_data_generator, + media_type="text/event-stream", + ) tasks = [] tasks.append( proxy_logging_obj.during_call_hook( @@ -3998,10 +4011,24 @@ async def completion( data["model"] = litellm.model_alias_map[data["model"]] ### CALL HOOKS ### - modify incoming data before calling the model - data = await proxy_logging_obj.pre_call_hook( - user_api_key_dict=user_api_key_dict, data=data, call_type="completion" + data = await proxy_logging_obj.pre_call_hook( # type: ignore + user_api_key_dict=user_api_key_dict, data=data, call_type="text_completion" ) + if isinstance(data, litellm.TextCompletionResponse): + return data + elif isinstance(data, litellm.TextCompletionStreamWrapper): + selected_data_generator = select_data_generator( + response=data, + user_api_key_dict=user_api_key_dict, + request_data={}, + ) + + return StreamingResponse( + selected_data_generator, + media_type="text/event-stream", + ) + ### ROUTE THE REQUESTs ### router_model_names = llm_router.model_names if llm_router is not None else [] # skip router if user passed their key diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 09e772e10..fc49ebc7f 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -19,7 +19,16 @@ from litellm.proxy.hooks.parallel_request_limiter import ( _PROXY_MaxParallelRequestsHandler, ) from litellm._service_logger import ServiceLogging, ServiceTypes -from litellm import ModelResponse, 
EmbeddingResponse, ImageResponse +from litellm import ( + ModelResponse, + EmbeddingResponse, + ImageResponse, + TranscriptionResponse, + TextCompletionResponse, + CustomStreamWrapper, + TextCompletionStreamWrapper, +) +from litellm.utils import ModelResponseIterator from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter from litellm.proxy.hooks.tpm_rpm_limiter import _PROXY_MaxTPMRPMLimiter from litellm.proxy.hooks.cache_control_check import _PROXY_CacheControlCheck @@ -32,6 +41,7 @@ from email.mime.text import MIMEText from email.mime.multipart import MIMEMultipart from datetime import datetime, timedelta from litellm.integrations.slack_alerting import SlackAlerting +from typing_extensions import overload def print_verbose(print_statement): @@ -176,18 +186,60 @@ class ProxyLogging: ) litellm.utils.set_callbacks(callback_list=callback_list) + # fmt: off + + @overload + async def pre_call_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + data: dict, + call_type: Literal["completion"] + ) -> Union[dict, ModelResponse, CustomStreamWrapper]: + ... + + @overload + async def pre_call_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + data: dict, + call_type: Literal["text_completion"] + ) -> Union[dict, TextCompletionResponse, TextCompletionStreamWrapper]: + ... + + @overload + async def pre_call_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + data: dict, + call_type: Literal["embeddings", + "image_generation", + "moderation", + "audio_transcription",] + ) -> dict: + ... + + # fmt: on + + # The actual implementation of the function async def pre_call_hook( self, user_api_key_dict: UserAPIKeyAuth, data: dict, call_type: Literal[ "completion", + "text_completion", "embeddings", "image_generation", "moderation", "audio_transcription", ], - ): + ) -> Union[ + dict, + ModelResponse, + TextCompletionResponse, + CustomStreamWrapper, + TextCompletionStreamWrapper, + ]: """ Allows users to modify/reject the incoming request to the proxy, without having to deal with parsing Request body. 
@@ -214,7 +266,58 @@ class ProxyLogging: call_type=call_type, ) if response is not None: - data = response + if isinstance(response, Exception): + raise response + elif isinstance(response, dict): + data = response + elif isinstance(response, str): + if call_type == "completion": + _chat_response = ModelResponse() + _chat_response.choices[0].message.content = response + + if ( + data.get("stream", None) is not None + and data["stream"] == True + ): + _iterator = ModelResponseIterator( + model_response=_chat_response + ) + return CustomStreamWrapper( + completion_stream=_iterator, + model=data.get("model", ""), + custom_llm_provider="cached_response", + logging_obj=data.get( + "litellm_logging_obj", None + ), + ) + return _response + elif call_type == "text_completion": + if ( + data.get("stream", None) is not None + and data["stream"] == True + ): + _chat_response = ModelResponse() + _chat_response.choices[0].message.content = response + + if ( + data.get("stream", None) is not None + and data["stream"] == True + ): + _iterator = ModelResponseIterator( + model_response=_chat_response + ) + return TextCompletionStreamWrapper( + completion_stream=_iterator, + model=data.get("model", ""), + ) + else: + _response = TextCompletionResponse() + _response.choices[0].text = response + return _response + else: + raise HTTPException( + status_code=400, detail={"error": response} + ) print_verbose(f"final data being sent to {call_type} call: {data}") return data diff --git a/litellm/utils.py b/litellm/utils.py index ac246fca6..1e0485755 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -12187,3 +12187,29 @@ def _add_key_name_and_team_to_alert(request_info: str, metadata: dict) -> str: return request_info except: return request_info + + +class ModelResponseIterator: + def __init__(self, model_response): + self.model_response = model_response + self.is_done = False + + # Sync iterator + def __iter__(self): + return self + + def __next__(self): + if self.is_done: + raise StopIteration + self.is_done = True + return self.model_response + + # Async iterator + def __aiter__(self): + return self + + async def __anext__(self): + if self.is_done: + raise StopAsyncIteration + self.is_done = True + return self.model_response From 2519879e67ceac6cc926b2bdeb2d4d3d7bc9d7dc Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 10:45:37 -0700 Subject: [PATCH 08/35] add ImageObject --- litellm/__init__.py | 1 + litellm/llms/vertex_httpx.py | 57 +++++++++++++++++++++++++- litellm/main.py | 3 +- litellm/tests/test_image_generation.py | 8 +++- litellm/utils.py | 46 ++++++++++++++++++++- 5 files changed, 111 insertions(+), 4 deletions(-) diff --git a/litellm/__init__.py b/litellm/__init__.py index 83e30d775..92610afd9 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -726,6 +726,7 @@ from .utils import ( get_first_chars_messages, ModelResponse, ImageResponse, + ImageObject, ) from .llms.huggingface_restapi import HuggingfaceConfig from .llms.anthropic import AnthropicConfig diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index ca850674b..0e16c02e7 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -88,7 +88,7 @@ class VertexLLM(BaseLLM): assert isinstance(self._credentials.token, str) return self._credentials.token - async def aimage_generation( + def image_generation( self, prompt: str, vertex_project: str, @@ -101,6 +101,35 @@ class VertexLLM(BaseLLM): timeout: Optional[int] = None, logging_obj=None, model_response=None, + 
aimg_generation=False, + ): + if aimg_generation == True: + response = self.aimage_generation( + prompt=prompt, + vertex_project=vertex_project, + vertex_location=vertex_location, + model=model, + client=client, + optional_params=optional_params, + timeout=timeout, + logging_obj=logging_obj, + model_response=model_response, + ) + return response + + async def aimage_generation( + self, + prompt: str, + vertex_project: str, + vertex_location: str, + model_response: litellm.ImageResponse, + model: Optional[ + str + ] = "imagegeneration", # vertex ai uses imagegeneration as the default model + client: Optional[AsyncHTTPHandler] = None, + optional_params: Optional[dict] = None, + timeout: Optional[int] = None, + logging_obj=None, ): response = None if client is None: @@ -152,5 +181,31 @@ class VertexLLM(BaseLLM): if response.status_code != 200: raise Exception(f"Error: {response.status_code} {response.text}") + """ + Vertex AI Image generation response example: + { + "predictions": [ + { + "bytesBase64Encoded": "BASE64_IMG_BYTES", + "mimeType": "image/png" + }, + { + "mimeType": "image/png", + "bytesBase64Encoded": "BASE64_IMG_BYTES" + } + ] + } + """ + + _json_response = response.json() + _predictions = _json_response["predictions"] + + _response_data: List[litellm.ImageObject] = [] + for _prediction in _predictions: + _bytes_base64_encoded = _prediction["bytesBase64Encoded"] + image_object = litellm.ImageObject(b64_json=_bytes_base64_encoded) + _response_data.append(image_object) + + model_response.data = _response_data return model_response diff --git a/litellm/main.py b/litellm/main.py index 198e191fd..7601d98a2 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -3874,7 +3874,7 @@ def image_generation( or optional_params.pop("vertex_ai_credentials", None) or get_secret("VERTEXAI_CREDENTIALS") ) - model_response = vertex_chat_completion.aimage_generation( # type: ignore + model_response = vertex_chat_completion.image_generation( model=model, prompt=prompt, timeout=timeout, @@ -3883,6 +3883,7 @@ def image_generation( model_response=model_response, vertex_project=vertex_ai_project, vertex_location=vertex_ai_location, + aimg_generation=aimg_generation, ) return model_response diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 3de3ba763..886953c1a 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -178,7 +178,13 @@ async def test_aimage_generation_vertex_ai(): prompt="An olympic size swimming pool", model="vertex_ai/imagegeneration@006", ) - print(f"response: {response}") + assert response.data is not None + assert len(response.data) > 0 + + for d in response.data: + assert isinstance(d, litellm.ImageObject) + print("data in response.data", d) + assert d.b64_json is not None except litellm.RateLimitError as e: pass except litellm.ContentPolicyViolationError: diff --git a/litellm/utils.py b/litellm/utils.py index 6d0231e8f..b4a2bd618 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -965,10 +965,54 @@ class TextCompletionResponse(OpenAIObject): setattr(self, key, value) +class ImageObject(OpenAIObject): + """ + Represents the url or the content of an image generated by the OpenAI API. + + Attributes: + b64_json: The base64-encoded JSON of the generated image, if response_format is b64_json. + url: The URL of the generated image, if response_format is url (default). + revised_prompt: The prompt that was used to generate the image, if there was any revision to the prompt. 
+ + https://platform.openai.com/docs/api-reference/images/object + """ + + b64_json: Optional[str] = None + url: Optional[str] = None + revised_prompt: Optional[str] = None + + def __init__(self, b64_json=None, url=None, revised_prompt=None): + + super().__init__(b64_json=b64_json, url=url, revised_prompt=revised_prompt) + + def __contains__(self, key): + # Define custom behavior for the 'in' operator + return hasattr(self, key) + + def get(self, key, default=None): + # Custom .get() method to access attributes with a default value if the attribute doesn't exist + return getattr(self, key, default) + + def __getitem__(self, key): + # Allow dictionary-style access to attributes + return getattr(self, key) + + def __setitem__(self, key, value): + # Allow dictionary-style assignment of attributes + setattr(self, key, value) + + def json(self, **kwargs): + try: + return self.model_dump() # noqa + except: + # if using pydantic v1 + return self.dict() + + class ImageResponse(OpenAIObject): created: Optional[int] = None - data: Optional[list] = None + data: Optional[list[ImageObject]] = None usage: Optional[dict] = None From 2da89a0c8e9b6ceb1ca23eb0b14791c16bab75ec Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 10:51:25 -0700 Subject: [PATCH 09/35] fix vertex test --- litellm/tests/test_image_generation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 886953c1a..6acb28e5b 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -173,6 +173,9 @@ async def test_aimage_generation_bedrock_with_optional_params(): @pytest.mark.asyncio async def test_aimage_generation_vertex_ai(): + from test_amazing_vertex_completion import load_vertex_ai_credentials + + load_vertex_ai_credentials() try: response = await litellm.aimage_generation( prompt="An olympic size swimming pool", From 655478e8dcf2982349bc072b95386a83568f869c Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 10:55:10 -0700 Subject: [PATCH 10/35] fix python3.8 error --- litellm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/utils.py b/litellm/utils.py index b4a2bd618..3dac33e56 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -1012,7 +1012,7 @@ class ImageObject(OpenAIObject): class ImageResponse(OpenAIObject): created: Optional[int] = None - data: Optional[list[ImageObject]] = None + data: Optional[List[ImageObject]] = None usage: Optional[dict] = None From d50d552e5a238a988b0c7369a4a576318016f518 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 11:03:28 -0700 Subject: [PATCH 11/35] fix python 3.8 import --- litellm/llms/vertex_httpx.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index 0e16c02e7..f0b2cfcb3 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -3,7 +3,7 @@ import json from enum import Enum import requests # type: ignore import time -from typing import Callable, Optional, Union, List +from typing import Callable, Optional, Union, List, Any from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason import litellm, uuid import httpx, inspect # type: ignore @@ -25,8 +25,6 @@ class VertexAIError(Exception): class VertexLLM(BaseLLM): - from google.auth.credentials import Credentials # type: ignore[import-untyped] - def __init__(self) -> None: from 
google.auth.credentials import Credentials # type: ignore[import-untyped] @@ -36,7 +34,7 @@ class VertexLLM(BaseLLM): self._credentials: Optional[Credentials] = None self.project_id: Optional[str] = None - def load_auth(self) -> tuple[Credentials, str]: + def load_auth(self) -> tuple[Any, str]: from google.auth.transport.requests import Request # type: ignore[import-untyped] from google.auth.credentials import Credentials # type: ignore[import-untyped] import google.auth as google_auth @@ -57,7 +55,7 @@ class VertexLLM(BaseLLM): return credentials, project_id - def refresh_auth(self, credentials: Credentials) -> None: + def refresh_auth(self, credentials: Any) -> None: from google.auth.transport.requests import Request # type: ignore[import-untyped] credentials.refresh(Request()) From 91f8443381d152b6b29c4955e00a4e51d505e79e Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 11:11:14 -0700 Subject: [PATCH 12/35] fix add debug to vertex httpx image --- litellm/llms/vertex_httpx.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index f0b2cfcb3..4fb554fe4 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -168,6 +168,16 @@ class VertexLLM(BaseLLM): "parameters": {"sampleCount": 1}, } + request_str = f"\n curl -X POST \\\n -H \"Authorization: Bearer {auth_header[:10] + 'XXXXXXXXXX'}\" \\\n -H \"Content-Type: application/json; charset=utf-8\" \\\n -d {request_data} \\\n \"{url}\"" + logging_obj.pre_call( + input=prompt, + api_key=None, + additional_args={ + "complete_input_dict": optional_params, + "request_str": request_str, + }, + ) + response = await client.post( url=url, headers={ From 6ddc9873e5fbf36ed0ea9360cf39650c72ee1b04 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 11:14:16 -0700 Subject: [PATCH 13/35] test vertex image gen test --- litellm/tests/test_image_generation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 6acb28e5b..4a5a8dac9 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -175,11 +175,14 @@ async def test_aimage_generation_bedrock_with_optional_params(): async def test_aimage_generation_vertex_ai(): from test_amazing_vertex_completion import load_vertex_ai_credentials + litellm.set_verbose = True + load_vertex_ai_credentials() try: response = await litellm.aimage_generation( prompt="An olympic size swimming pool", model="vertex_ai/imagegeneration@006", + vertex_ai_project="adroit-crow-413218", ) assert response.data is not None assert len(response.data) > 0 From f11f207ae6ba3a39a886634b612b06f591b6eaca Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 20 May 2024 11:14:36 -0700 Subject: [PATCH 14/35] feat(proxy_server.py): refactor returning rejected message, to work with error logging log the rejected request as a failed call to langfuse/slack alerting --- litellm/exceptions.py | 26 +++++ litellm/integrations/custom_logger.py | 1 - litellm/proxy/_super_secret_config.yaml | 6 +- litellm/proxy/_types.py | 4 + .../proxy/hooks/prompt_injection_detection.py | 8 ++ litellm/proxy/proxy_server.py | 76 ++++++++++++--- litellm/proxy/utils.py | 97 +++---------------- 7 files changed, 118 insertions(+), 100 deletions(-) diff --git a/litellm/exceptions.py b/litellm/exceptions.py index 5eb66743b..d189b7ebe 100644 --- a/litellm/exceptions.py +++ b/litellm/exceptions.py @@ -177,6 +177,32 @@ class 
ContextWindowExceededError(BadRequestError): # type: ignore ) # Call the base class constructor with the parameters it needs +# sub class of bad request error - meant to help us catch guardrails-related errors on proxy. +class RejectedRequestError(BadRequestError): # type: ignore + def __init__( + self, + message, + model, + llm_provider, + request_data: dict, + litellm_debug_info: Optional[str] = None, + ): + self.status_code = 400 + self.message = message + self.model = model + self.llm_provider = llm_provider + self.litellm_debug_info = litellm_debug_info + self.request_data = request_data + request = httpx.Request(method="POST", url="https://api.openai.com/v1") + response = httpx.Response(status_code=500, request=request) + super().__init__( + message=self.message, + model=self.model, # type: ignore + llm_provider=self.llm_provider, # type: ignore + response=response, + ) # Call the base class constructor with the parameters it needs + + class ContentPolicyViolationError(BadRequestError): # type: ignore # Error code: 400 - {'error': {'code': 'content_policy_violation', 'message': 'Your request was rejected as a result of our safety system. Image descriptions generated from your prompt may contain text that is not allowed by our safety system. If you believe this was done in error, your request may succeed if retried, or by adjusting your prompt.', 'param': None, 'type': 'invalid_request_error'}} def __init__( diff --git a/litellm/integrations/custom_logger.py b/litellm/integrations/custom_logger.py index accb4f80f..e192cdaea 100644 --- a/litellm/integrations/custom_logger.py +++ b/litellm/integrations/custom_logger.py @@ -4,7 +4,6 @@ import dotenv, os from litellm.proxy._types import UserAPIKeyAuth from litellm.caching import DualCache -from litellm.utils import ModelResponse from typing import Literal, Union, Optional import traceback diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index 2195a077d..42b36950b 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -17,4 +17,8 @@ model_list: api_key: os.environ/AZURE_API_KEY # The `os.environ/` prefix tells litellm to read this from the env. See https://docs.litellm.ai/docs/simple_proxy#load-api-keys-from-vault router_settings: - enable_pre_call_checks: true \ No newline at end of file + enable_pre_call_checks: true + +litellm_settings: + callbacks: ["detect_prompt_injection"] + diff --git a/litellm/proxy/_types.py b/litellm/proxy/_types.py index b900b623b..492c222fe 100644 --- a/litellm/proxy/_types.py +++ b/litellm/proxy/_types.py @@ -251,6 +251,10 @@ class LiteLLMPromptInjectionParams(LiteLLMBase): llm_api_name: Optional[str] = None llm_api_system_prompt: Optional[str] = None llm_api_fail_call_string: Optional[str] = None + reject_as_response: Optional[bool] = Field( + default=False, + description="Return rejected request error message as a string to the user. 
Default behaviour is to raise an exception.", + ) @root_validator(pre=True) def check_llm_api_params(cls, values): diff --git a/litellm/proxy/hooks/prompt_injection_detection.py b/litellm/proxy/hooks/prompt_injection_detection.py index 896046e94..87cae71a8 100644 --- a/litellm/proxy/hooks/prompt_injection_detection.py +++ b/litellm/proxy/hooks/prompt_injection_detection.py @@ -146,6 +146,7 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger): try: assert call_type in [ "completion", + "text_completion", "embeddings", "image_generation", "moderation", @@ -192,6 +193,13 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger): return data except HTTPException as e: + if ( + e.status_code == 400 + and isinstance(e.detail, dict) + and "error" in e.detail + ): + if self.prompt_injection_params.reject_as_response: + return e.detail["error"] raise e except Exception as e: traceback.print_exc() diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 6ad3e598a..6b395e138 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -124,6 +124,7 @@ from litellm.proxy.auth.auth_checks import ( get_actual_routes, ) from litellm.llms.custom_httpx.httpx_handler import HTTPHandler +from litellm.exceptions import RejectedRequestError try: from litellm._version import version @@ -3766,19 +3767,6 @@ async def chat_completion( user_api_key_dict=user_api_key_dict, data=data, call_type="completion" ) - if isinstance(data, litellm.ModelResponse): - return data - elif isinstance(data, litellm.CustomStreamWrapper): - selected_data_generator = select_data_generator( - response=data, - user_api_key_dict=user_api_key_dict, - request_data={}, - ) - - return StreamingResponse( - selected_data_generator, - media_type="text/event-stream", - ) tasks = [] tasks.append( proxy_logging_obj.during_call_hook( @@ -3893,6 +3881,40 @@ async def chat_completion( ) return response + except RejectedRequestError as e: + _data = e.request_data + _data["litellm_status"] = "fail" # used for alerting + await proxy_logging_obj.post_call_failure_hook( + user_api_key_dict=user_api_key_dict, + original_exception=e, + request_data=_data, + ) + _chat_response = litellm.ModelResponse() + _chat_response.choices[0].message.content = e.message # type: ignore + + if data.get("stream", None) is not None and data["stream"] == True: + _iterator = litellm.utils.ModelResponseIterator( + model_response=_chat_response + ) + _streaming_response = litellm.CustomStreamWrapper( + completion_stream=_iterator, + model=data.get("model", ""), + custom_llm_provider="cached_response", + logging_obj=data.get("litellm_logging_obj", None), + ) + selected_data_generator = select_data_generator( + response=e.message, + user_api_key_dict=user_api_key_dict, + request_data=_data, + ) + + return StreamingResponse( + selected_data_generator, + media_type="text/event-stream", + ) + _usage = litellm.Usage(prompt_tokens=0, completion_tokens=0, total_tokens=0) + _chat_response.usage = _usage # type: ignore + return _chat_response except Exception as e: data["litellm_status"] = "fail" # used for alerting traceback.print_exc() @@ -4112,6 +4134,34 @@ async def completion( ) return response + except RejectedRequestError as e: + _data = e.request_data + _data["litellm_status"] = "fail" # used for alerting + await proxy_logging_obj.post_call_failure_hook( + user_api_key_dict=user_api_key_dict, + original_exception=e, + request_data=_data, + ) + if _data.get("stream", None) is not None and _data["stream"] == True: + _chat_response = 
litellm.ModelResponse() + _usage = litellm.Usage( + prompt_tokens=0, + completion_tokens=0, + total_tokens=0, + ) + _chat_response.usage = _usage # type: ignore + _chat_response.choices[0].message.content = e.message # type: ignore + _iterator = litellm.utils.ModelResponseIterator( + model_response=_chat_response + ) + return litellm.TextCompletionStreamWrapper( + completion_stream=_iterator, + model=_data.get("model", ""), + ) + else: + _response = litellm.TextCompletionResponse() + _response.choices[0].text = e.message + return _response except Exception as e: data["litellm_status"] = "fail" # used for alerting await proxy_logging_obj.post_call_failure_hook( diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index fc49ebc7f..586b4c4cd 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -18,6 +18,7 @@ from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.proxy.hooks.parallel_request_limiter import ( _PROXY_MaxParallelRequestsHandler, ) +from litellm.exceptions import RejectedRequestError from litellm._service_logger import ServiceLogging, ServiceTypes from litellm import ( ModelResponse, @@ -186,40 +187,6 @@ class ProxyLogging: ) litellm.utils.set_callbacks(callback_list=callback_list) - # fmt: off - - @overload - async def pre_call_hook( - self, - user_api_key_dict: UserAPIKeyAuth, - data: dict, - call_type: Literal["completion"] - ) -> Union[dict, ModelResponse, CustomStreamWrapper]: - ... - - @overload - async def pre_call_hook( - self, - user_api_key_dict: UserAPIKeyAuth, - data: dict, - call_type: Literal["text_completion"] - ) -> Union[dict, TextCompletionResponse, TextCompletionStreamWrapper]: - ... - - @overload - async def pre_call_hook( - self, - user_api_key_dict: UserAPIKeyAuth, - data: dict, - call_type: Literal["embeddings", - "image_generation", - "moderation", - "audio_transcription",] - ) -> dict: - ... - - # fmt: on - # The actual implementation of the function async def pre_call_hook( self, @@ -233,13 +200,7 @@ class ProxyLogging: "moderation", "audio_transcription", ], - ) -> Union[ - dict, - ModelResponse, - TextCompletionResponse, - CustomStreamWrapper, - TextCompletionStreamWrapper, - ]: + ) -> dict: """ Allows users to modify/reject the incoming request to the proxy, without having to deal with parsing Request body. 
@@ -271,54 +232,20 @@ class ProxyLogging: elif isinstance(response, dict): data = response elif isinstance(response, str): - if call_type == "completion": - _chat_response = ModelResponse() - _chat_response.choices[0].message.content = response - - if ( - data.get("stream", None) is not None - and data["stream"] == True - ): - _iterator = ModelResponseIterator( - model_response=_chat_response - ) - return CustomStreamWrapper( - completion_stream=_iterator, - model=data.get("model", ""), - custom_llm_provider="cached_response", - logging_obj=data.get( - "litellm_logging_obj", None - ), - ) - return _response - elif call_type == "text_completion": - if ( - data.get("stream", None) is not None - and data["stream"] == True - ): - _chat_response = ModelResponse() - _chat_response.choices[0].message.content = response - - if ( - data.get("stream", None) is not None - and data["stream"] == True - ): - _iterator = ModelResponseIterator( - model_response=_chat_response - ) - return TextCompletionStreamWrapper( - completion_stream=_iterator, - model=data.get("model", ""), - ) - else: - _response = TextCompletionResponse() - _response.choices[0].text = response - return _response + if ( + call_type == "completion" + or call_type == "text_completion" + ): + raise RejectedRequestError( + message=response, + model=data.get("model", ""), + llm_provider="", + request_data=data, + ) else: raise HTTPException( status_code=400, detail={"error": response} ) - print_verbose(f"final data being sent to {call_type} call: {data}") return data except Exception as e: From 571d4cf569f6f3320c3daca6e9c5f8e5b80f5181 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 12:10:28 -0700 Subject: [PATCH 15/35] test - fix test_aimage_generation_vertex_ai --- litellm/tests/test_image_generation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 4a5a8dac9..9fe32544b 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -183,6 +183,7 @@ async def test_aimage_generation_vertex_ai(): prompt="An olympic size swimming pool", model="vertex_ai/imagegeneration@006", vertex_ai_project="adroit-crow-413218", + vertex_ai_location="us-central1", ) assert response.data is not None assert len(response.data) > 0 From aa0ed8238b8f76106ebca76f7d4ef7f2e27e1aa0 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 12:18:31 -0700 Subject: [PATCH 16/35] docs - image generation vertex --- docs/my-website/docs/image_generation.md | 16 ++++++++++++++++ docs/my-website/docs/providers/vertex.md | 12 ++++++++++++ 2 files changed, 28 insertions(+) diff --git a/docs/my-website/docs/image_generation.md b/docs/my-website/docs/image_generation.md index 002d95c03..7bb4d2c99 100644 --- a/docs/my-website/docs/image_generation.md +++ b/docs/my-website/docs/image_generation.md @@ -150,4 +150,20 @@ response = image_generation( model="bedrock/stability.stable-diffusion-xl-v0", ) print(f"response: {response}") +``` + +## VertexAI - Image Generation Models + +### Usage + +Use this for image generation models on VertexAI + +```python +response = litellm.image_generation( + prompt="An olympic size swimming pool", + model="vertex_ai/imagegeneration@006", + vertex_ai_project="adroit-crow-413218", + vertex_ai_location="us-central1", +) +print(f"response: {response}") ``` \ No newline at end of file diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index 
b67eb350b..dc0ef48b4 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -508,6 +508,18 @@ All models listed [here](https://github.com/BerriAI/litellm/blob/57f37f743886a02 | text-embedding-preview-0409 | `embedding(model="vertex_ai/text-embedding-preview-0409", input)` | | text-multilingual-embedding-preview-0409 | `embedding(model="vertex_ai/text-multilingual-embedding-preview-0409", input)` | +## Image Generation Models + +Usage + +```python +response = await litellm.aimage_generation( + prompt="An olympic size swimming pool", + model="vertex_ai/imagegeneration@006", + vertex_ai_project="adroit-crow-413218", + vertex_ai_location="us-central1", +) +``` ## Extra From dabaf5f2977719fd49c657a126e0b87e05594785 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 12:21:02 -0700 Subject: [PATCH 17/35] fix python 3.8 Tuple --- litellm/llms/vertex_httpx.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index 4fb554fe4..9fc9080b0 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -3,7 +3,7 @@ import json from enum import Enum import requests # type: ignore import time -from typing import Callable, Optional, Union, List, Any +from typing import Callable, Optional, Union, List, Any, Tuple from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason import litellm, uuid import httpx, inspect # type: ignore @@ -34,7 +34,7 @@ class VertexLLM(BaseLLM): self._credentials: Optional[Credentials] = None self.project_id: Optional[str] = None - def load_auth(self) -> tuple[Any, str]: + def load_auth(self) -> Tuple[Any, str]: from google.auth.transport.requests import Request # type: ignore[import-untyped] from google.auth.credentials import Credentials # type: ignore[import-untyped] import google.auth as google_auth From 1fe3900800a4ffc0411b73b3ad66f23bc1fa67fe Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 12:23:27 -0700 Subject: [PATCH 18/35] fix python 3.8 --- litellm/llms/vertex_httpx.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index 9fc9080b0..61920d4e6 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -26,12 +26,10 @@ class VertexAIError(Exception): class VertexLLM(BaseLLM): def __init__(self) -> None: - from google.auth.credentials import Credentials # type: ignore[import-untyped] - super().__init__() self.access_token: Optional[str] = None self.refresh_token: Optional[str] = None - self._credentials: Optional[Credentials] = None + self._credentials: Optional[Any] = None self.project_id: Optional[str] = None def load_auth(self) -> Tuple[Any, str]: From b41f30ca6097548e2e305882f51ce07fc82677a0 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 20 May 2024 12:32:19 -0700 Subject: [PATCH 19/35] fix(proxy_server.py): fixes for making rejected responses work with streaming --- litellm/proxy/_super_secret_config.yaml | 4 +++ .../proxy/hooks/prompt_injection_detection.py | 6 ++-- litellm/proxy/proxy_server.py | 34 +++++++++---------- litellm/utils.py | 12 +++++-- 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/litellm/proxy/_super_secret_config.yaml b/litellm/proxy/_super_secret_config.yaml index 42b36950b..8db3eea3e 100644 --- a/litellm/proxy/_super_secret_config.yaml +++ b/litellm/proxy/_super_secret_config.yaml @@ -21,4 +21,8 @@ router_settings: 
litellm_settings: callbacks: ["detect_prompt_injection"] + prompt_injection_params: + heuristics_check: true + similarity_check: true + reject_as_response: true diff --git a/litellm/proxy/hooks/prompt_injection_detection.py b/litellm/proxy/hooks/prompt_injection_detection.py index 87cae71a8..08dbedd8c 100644 --- a/litellm/proxy/hooks/prompt_injection_detection.py +++ b/litellm/proxy/hooks/prompt_injection_detection.py @@ -193,13 +193,15 @@ class _OPTIONAL_PromptInjectionDetection(CustomLogger): return data except HTTPException as e: + if ( e.status_code == 400 and isinstance(e.detail, dict) and "error" in e.detail + and self.prompt_injection_params is not None + and self.prompt_injection_params.reject_as_response ): - if self.prompt_injection_params.reject_as_response: - return e.detail["error"] + return e.detail["error"] raise e except Exception as e: traceback.print_exc() diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 6b395e138..016db6ea3 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -3894,7 +3894,7 @@ async def chat_completion( if data.get("stream", None) is not None and data["stream"] == True: _iterator = litellm.utils.ModelResponseIterator( - model_response=_chat_response + model_response=_chat_response, convert_to_delta=True ) _streaming_response = litellm.CustomStreamWrapper( completion_stream=_iterator, @@ -3903,7 +3903,7 @@ async def chat_completion( logging_obj=data.get("litellm_logging_obj", None), ) selected_data_generator = select_data_generator( - response=e.message, + response=_streaming_response, user_api_key_dict=user_api_key_dict, request_data=_data, ) @@ -4037,20 +4037,6 @@ async def completion( user_api_key_dict=user_api_key_dict, data=data, call_type="text_completion" ) - if isinstance(data, litellm.TextCompletionResponse): - return data - elif isinstance(data, litellm.TextCompletionStreamWrapper): - selected_data_generator = select_data_generator( - response=data, - user_api_key_dict=user_api_key_dict, - request_data={}, - ) - - return StreamingResponse( - selected_data_generator, - media_type="text/event-stream", - ) - ### ROUTE THE REQUESTs ### router_model_names = llm_router.model_names if llm_router is not None else [] # skip router if user passed their key @@ -4152,12 +4138,24 @@ async def completion( _chat_response.usage = _usage # type: ignore _chat_response.choices[0].message.content = e.message # type: ignore _iterator = litellm.utils.ModelResponseIterator( - model_response=_chat_response + model_response=_chat_response, convert_to_delta=True ) - return litellm.TextCompletionStreamWrapper( + _streaming_response = litellm.TextCompletionStreamWrapper( completion_stream=_iterator, model=_data.get("model", ""), ) + + selected_data_generator = select_data_generator( + response=_streaming_response, + user_api_key_dict=user_api_key_dict, + request_data=data, + ) + + return StreamingResponse( + selected_data_generator, + media_type="text/event-stream", + headers={}, + ) else: _response = litellm.TextCompletionResponse() _response.choices[0].text = e.message diff --git a/litellm/utils.py b/litellm/utils.py index 1e0485755..5029e8c61 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -6440,6 +6440,7 @@ def get_formatted_prompt( "image_generation", "audio_transcription", "moderation", + "text_completion", ], ) -> str: """ @@ -6452,6 +6453,8 @@ def get_formatted_prompt( for m in data["messages"]: if "content" in m and isinstance(m["content"], str): prompt += m["content"] + elif call_type == 
"text_completion": + prompt = data["prompt"] elif call_type == "embedding" or call_type == "moderation": if isinstance(data["input"], str): prompt = data["input"] @@ -12190,8 +12193,13 @@ def _add_key_name_and_team_to_alert(request_info: str, metadata: dict) -> str: class ModelResponseIterator: - def __init__(self, model_response): - self.model_response = model_response + def __init__(self, model_response: ModelResponse, convert_to_delta: bool = False): + if convert_to_delta == True: + self.model_response = ModelResponse(stream=True) + _delta = self.model_response.choices[0].delta # type: ignore + _delta.content = model_response.choices[0].message.content # type: ignore + else: + self.model_response = model_response self.is_done = False # Sync iterator From bc3c06bc74bd0768fb0f8d258b1f54d4e5226626 Mon Sep 17 00:00:00 2001 From: Krrish Dholakia Date: Mon, 20 May 2024 12:45:03 -0700 Subject: [PATCH 20/35] docs(call_hooks.md): update docs --- docs/my-website/docs/proxy/call_hooks.md | 147 ++++++++++++++++++++--- 1 file changed, 131 insertions(+), 16 deletions(-) diff --git a/docs/my-website/docs/proxy/call_hooks.md b/docs/my-website/docs/proxy/call_hooks.md index 3195e2e5a..3a8726e87 100644 --- a/docs/my-website/docs/proxy/call_hooks.md +++ b/docs/my-website/docs/proxy/call_hooks.md @@ -25,26 +25,45 @@ class MyCustomHandler(CustomLogger): # https://docs.litellm.ai/docs/observabilit def __init__(self): pass - #### ASYNC #### - - async def async_log_stream_event(self, kwargs, response_obj, start_time, end_time): - pass - - async def async_log_pre_api_call(self, model, messages, kwargs): - pass - - async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): - pass - - async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - pass - #### CALL HOOKS - proxy only #### - async def async_pre_call_hook(self, user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, call_type: Literal["completion", "embeddings"]): + async def async_pre_call_hook(self, user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, call_type: Literal[ + "completion", + "text_completion", + "embeddings", + "image_generation", + "moderation", + "audio_transcription", + ]) -> Optional[dict, str, Exception]: data["model"] = "my-new-model" return data + async def async_post_call_failure_hook( + self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth + ): + pass + + async def async_post_call_success_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + response, + ): + pass + + async def async_moderation_hook( # call made in parallel to llm api call + self, + data: dict, + user_api_key_dict: UserAPIKeyAuth, + call_type: Literal["completion", "embeddings", "image_generation"], + ): + pass + + async def async_post_call_streaming_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + response: str, + ): + pass proxy_handler_instance = MyCustomHandler() ``` @@ -190,4 +209,100 @@ general_settings: **Result** - \ No newline at end of file + + +## Advanced - Return rejected message as response + +For chat completions and text completion calls, you can return a rejected message as a user response. + +Do this by returning a string. LiteLLM takes care of returning the response in the correct format depending on the endpoint and if it's streaming/non-streaming. + +For non-chat/text completion endpoints, this response is returned as a 400 status code exception. + + +### 1. 
Create Custom Handler + +```python +from litellm.integrations.custom_logger import CustomLogger +import litellm +from litellm.utils import get_formatted_prompt + +# This file includes the custom callbacks for LiteLLM Proxy +# Once defined, these can be passed in proxy_config.yaml +class MyCustomHandler(CustomLogger): + def __init__(self): + pass + + #### CALL HOOKS - proxy only #### + + async def async_pre_call_hook(self, user_api_key_dict: UserAPIKeyAuth, cache: DualCache, data: dict, call_type: Literal[ + "completion", + "text_completion", + "embeddings", + "image_generation", + "moderation", + "audio_transcription", + ]) -> Optional[dict, str, Exception]: + formatted_prompt = get_formatted_prompt(data=data, call_type=call_type) + + if "Hello world" in formatted_prompt: + return "This is an invalid response" + + return data + +proxy_handler_instance = MyCustomHandler() +``` + +### 2. Update config.yaml + +```yaml +model_list: + - model_name: gpt-3.5-turbo + litellm_params: + model: gpt-3.5-turbo + +litellm_settings: + callbacks: custom_callbacks.proxy_handler_instance # sets litellm.callbacks = [proxy_handler_instance] +``` + + +### 3. Test it! + +```shell +$ litellm /path/to/config.yaml +``` +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --data ' { + "model": "gpt-3.5-turbo", + "messages": [ + { + "role": "user", + "content": "Hello world" + } + ], + }' +``` + +**Expected Response** + +``` +{ + "id": "chatcmpl-d00bbede-2d90-4618-bf7b-11a1c23cf360", + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "This is an invalid response.", # 👈 REJECTED RESPONSE + "role": "assistant" + } + } + ], + "created": 1716234198, + "model": null, + "object": "chat.completion", + "system_fingerprint": null, + "usage": {} +} +``` \ No newline at end of file From 561b00283cccbb8cbc27aab196af6c7491cd5c41 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 12:54:08 -0700 Subject: [PATCH 21/35] feat - enforce sso on Admin UI --- litellm/proxy/proxy_server.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 7e1e2646f..be638df23 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -8987,9 +8987,25 @@ async def google_login(request: Request): PROXY_BASE_URL should be the your deployed proxy endpoint, e.g. PROXY_BASE_URL="https://litellm-production-7002.up.railway.app/" Example: """ + global premium_user microsoft_client_id = os.getenv("MICROSOFT_CLIENT_ID", None) google_client_id = os.getenv("GOOGLE_CLIENT_ID", None) generic_client_id = os.getenv("GENERIC_CLIENT_ID", None) + + ####### Check if user is a Enterprise / Premium User ####### + if ( + microsoft_client_id is not None + or google_client_id is not None + or generic_client_id is not None + ): + if premium_user != True: + raise ProxyException( + message="You must be a LiteLLM Enterprise user to use SSO. 
Meet with us to get a license: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat", + type="auth_error", + param="premium_user", + code=status.HTTP_403_FORBIDDEN, + ) + # get url from request redirect_url = os.getenv("PROXY_BASE_URL", str(request.base_url)) ui_username = os.getenv("UI_USERNAME") From d956020470270a6d5b40b5a3aac2ae6544b67af7 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 13:02:56 -0700 Subject: [PATCH 22/35] fix error on enforce sso --- litellm/proxy/proxy_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index be638df23..746563ab7 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -9000,7 +9000,7 @@ async def google_login(request: Request): ): if premium_user != True: raise ProxyException( - message="You must be a LiteLLM Enterprise user to use SSO. Meet with us to get a license: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat", + message="You must be a LiteLLM Enterprise user to use SSO. If you have a license please set `LITELLM_LICENSE` in your env. If you want to obtain a license meet with us here: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat You are seeing this error message because You set one of `MICROSOFT_CLIENT_ID`, `GOOGLE_CLIENT_ID`, or `GENERIC_CLIENT_ID` in your env. Please unset this", type="auth_error", param="premium_user", code=status.HTTP_403_FORBIDDEN, From 11c9780ff05d2a4661749bdf98092a0cc956f4e5 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 13:11:10 -0700 Subject: [PATCH 23/35] fix self.async_handler --- litellm/llms/vertex_httpx.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index 61920d4e6..e7b31b155 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -31,6 +31,7 @@ class VertexLLM(BaseLLM): self.refresh_token: Optional[str] = None self._credentials: Optional[Any] = None self.project_id: Optional[str] = None + self.async_handler: Optional[AsyncHTTPHandler] = None def load_auth(self) -> Tuple[Any, str]: from google.auth.transport.requests import Request # type: ignore[import-untyped] @@ -134,9 +135,9 @@ class VertexLLM(BaseLLM): if isinstance(timeout, float) or isinstance(timeout, int): _httpx_timeout = httpx.Timeout(timeout) _params["timeout"] = _httpx_timeout - client = AsyncHTTPHandler(**_params) # type: ignore + self.async_handler = AsyncHTTPHandler(**_params) # type: ignore else: - client = client # type: ignore + self.async_handler = client # type: ignore # make POST request to # https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict @@ -176,7 +177,7 @@ class VertexLLM(BaseLLM): }, ) - response = await client.post( + response = await self.async_handler.post( url=url, headers={ "Content-Type": "application/json; charset=utf-8", From 2c25bfa8dfa34298f8dffc482012e305447a9a8e Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 13:13:19 -0700 Subject: [PATCH 24/35] fix vertex ai import --- litellm/llms/vertex_httpx.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index e7b31b155..59ded6be0 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -157,9 +157,6 @@ class VertexLLM(BaseLLM): } \ 
"https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict" """ - - import vertexai - auth_header = self._ensure_access_token() request_data = { From 518db139820010a209394ddfb3ab0e1e6370f34a Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 13:28:20 -0700 Subject: [PATCH 25/35] add parameter mapping with vertex ai --- docs/my-website/docs/providers/vertex.md | 13 +++++++++++++ litellm/llms/vertex_httpx.py | 10 ++++++++-- litellm/tests/test_image_generation.py | 1 + litellm/utils.py | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/my-website/docs/providers/vertex.md b/docs/my-website/docs/providers/vertex.md index dc0ef48b4..32c3ea188 100644 --- a/docs/my-website/docs/providers/vertex.md +++ b/docs/my-website/docs/providers/vertex.md @@ -521,6 +521,19 @@ response = await litellm.aimage_generation( ) ``` +**Generating multiple images** + +Use the `n` parameter to pass how many images you want generated +```python +response = await litellm.aimage_generation( + prompt="An olympic size swimming pool", + model="vertex_ai/imagegeneration@006", + vertex_ai_project="adroit-crow-413218", + vertex_ai_location="us-central1", + n=1, +) +``` + ## Extra ### Using `GOOGLE_APPLICATION_CREDENTIALS` diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index 59ded6be0..35a6b1d47 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -153,15 +153,21 @@ class VertexLLM(BaseLLM): { "prompt": "a cat" } - ] + ], + "parameters": { + "sampleCount": 1 + } } \ "https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict" """ auth_header = self._ensure_access_token() + optional_params = optional_params or { + "sampleCount": 1 + } # default optional params request_data = { "instances": [{"prompt": prompt}], - "parameters": {"sampleCount": 1}, + "parameters": optional_params, } request_str = f"\n curl -X POST \\\n -H \"Authorization: Bearer {auth_header[:10] + 'XXXXXXXXXX'}\" \\\n -H \"Content-Type: application/json; charset=utf-8\" \\\n -d {request_data} \\\n \"{url}\"" diff --git a/litellm/tests/test_image_generation.py b/litellm/tests/test_image_generation.py index 9fe32544b..35f66ad47 100644 --- a/litellm/tests/test_image_generation.py +++ b/litellm/tests/test_image_generation.py @@ -184,6 +184,7 @@ async def test_aimage_generation_vertex_ai(): model="vertex_ai/imagegeneration@006", vertex_ai_project="adroit-crow-413218", vertex_ai_location="us-central1", + n=1, ) assert response.data is not None assert len(response.data) > 0 diff --git a/litellm/utils.py b/litellm/utils.py index 3dac33e56..19f7c9910 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -4946,6 +4946,14 @@ def get_optional_params_image_gen( width, height = size.split("x") optional_params["width"] = int(width) optional_params["height"] = int(height) + elif custom_llm_provider == "vertex_ai": + supported_params = ["n"] + """ + All params here: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/imagegeneration?project=adroit-crow-413218 + """ + _check_valid_arg(supported_params=supported_params) + if n is not None: + optional_params["sampleCount"] = int(n) for k in passed_params.keys(): if k not in default_params.keys(): From f3eb8325932467e37db4da9288ebdf11d5f44f65 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 13:43:54 -0700 Subject: [PATCH 26/35] fix vertex 
httpx client --- litellm/llms/vertex_httpx.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/litellm/llms/vertex_httpx.py b/litellm/llms/vertex_httpx.py index 35a6b1d47..b8c698c90 100644 --- a/litellm/llms/vertex_httpx.py +++ b/litellm/llms/vertex_httpx.py @@ -135,6 +135,9 @@ class VertexLLM(BaseLLM): if isinstance(timeout, float) or isinstance(timeout, int): _httpx_timeout = httpx.Timeout(timeout) _params["timeout"] = _httpx_timeout + else: + _params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0) + self.async_handler = AsyncHTTPHandler(**_params) # type: ignore else: self.async_handler = client # type: ignore From 02fc507b01e9eaa30dec376d6cb6188a5c4aa105 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 14:26:30 -0700 Subject: [PATCH 27/35] fix divide by 0 bug --- litellm/integrations/slack_alerting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index 227db079d..bd07e5dd7 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -875,7 +875,8 @@ Model Info: if isinstance(response_obj, litellm.ModelResponse): completion_tokens = response_obj.usage.completion_tokens - final_value = float(response_s.total_seconds() / completion_tokens) + if completion_tokens is not None and completion_tokens > 0: + final_value = float(response_s.total_seconds() / completion_tokens) await self.async_update_daily_reports( DeploymentMetrics( From f417495b7b062a4b846b4bd8f4827e0b7c9015bd Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 14:59:06 -0700 Subject: [PATCH 28/35] fix - only adding alerting callbacks when alerting is on --- litellm/proxy/utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 09e772e10..1bafdd89e 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -131,7 +131,13 @@ class ProxyLogging: alerting_args=alerting_args, ) - if "daily_reports" in self.alert_types: + if ( + self.alerting is not None + and "slack" in self.alerting + and "daily_reports" in self.alert_types + ): + # NOTE: ENSURE we only add callbacks when alerting is on + # We should NOT add callbacks when alerting is off litellm.callbacks.append(self.slack_alerting_instance) # type: ignore if redis_cache is not None: From b5f8c6387535547165c74ccc9cd6fbbe8481f9ba Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 15:03:04 -0700 Subject: [PATCH 29/35] try/except deployment metrics error --- litellm/integrations/slack_alerting.py | 49 +++++++++++++++----------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/litellm/integrations/slack_alerting.py b/litellm/integrations/slack_alerting.py index bd07e5dd7..a5ae97d41 100644 --- a/litellm/integrations/slack_alerting.py +++ b/litellm/integrations/slack_alerting.py @@ -864,28 +864,37 @@ Model Info: async def async_log_success_event(self, kwargs, response_obj, start_time, end_time): """Log deployment latency""" - if "daily_reports" in self.alert_types: - model_id = ( - kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "") - ) - response_s: timedelta = end_time - start_time - - final_value = response_s - total_tokens = 0 - - if isinstance(response_obj, litellm.ModelResponse): - completion_tokens = response_obj.usage.completion_tokens - if completion_tokens is not None and completion_tokens > 0: - final_value = float(response_s.total_seconds() / completion_tokens) - 
- await self.async_update_daily_reports( - DeploymentMetrics( - id=model_id, - failed_request=False, - latency_per_output_token=final_value, - updated_at=litellm.utils.get_utc_datetime(), + try: + if "daily_reports" in self.alert_types: + model_id = ( + kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "") ) + response_s: timedelta = end_time - start_time + + final_value = response_s + total_tokens = 0 + + if isinstance(response_obj, litellm.ModelResponse): + completion_tokens = response_obj.usage.completion_tokens + if completion_tokens is not None and completion_tokens > 0: + final_value = float( + response_s.total_seconds() / completion_tokens + ) + + await self.async_update_daily_reports( + DeploymentMetrics( + id=model_id, + failed_request=False, + latency_per_output_token=final_value, + updated_at=litellm.utils.get_utc_datetime(), + ) + ) + except Exception as e: + verbose_proxy_logger.error( + "[Non-Blocking Error] Slack Alerting: Got error in logging LLM deployment latency: ", + e, ) + pass async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): """Log failure + deployment latency""" From 498bfa9a4cc7eeb00eec31af8be50689a68b4835 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 15:43:29 -0700 Subject: [PATCH 30/35] fix - revert check_request_disconnection --- litellm/proxy/proxy_server.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 72427bd9f..351984c2b 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -3646,7 +3646,6 @@ async def chat_completion( ): global general_settings, user_debug, proxy_logging_obj, llm_model_list data = {} - check_request_disconnected = None try: body = await request.body() body_str = body.decode() @@ -3829,9 +3828,6 @@ async def chat_completion( *tasks ) # run the moderation check in parallel to the actual llm api call - check_request_disconnected = asyncio.create_task( - check_request_disconnection(request, llm_responses) - ) responses = await llm_responses response = responses[1] @@ -3913,9 +3909,6 @@ async def chat_completion( param=getattr(e, "param", "None"), code=getattr(e, "status_code", 500), ) - finally: - if check_request_disconnected is not None: - check_request_disconnected.cancel() @router.post( @@ -3942,7 +3935,6 @@ async def completion( ): global user_temperature, user_request_timeout, user_max_tokens, user_api_base data = {} - check_request_disconnected = None try: body = await request.body() body_str = body.decode() @@ -4042,9 +4034,6 @@ async def completion( + data.get("model", "") }, ) - check_request_disconnected = asyncio.create_task( - check_request_disconnection(request, llm_response) - ) # Await the llm_response task response = await llm_response @@ -4109,9 +4098,6 @@ async def completion( param=getattr(e, "param", "None"), code=getattr(e, "status_code", 500), ) - finally: - if check_request_disconnected is not None: - check_request_disconnected.cancel() @router.post( From 0ddaf320efad0f02baabaf98ff23ddbf8096fd91 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 16:17:09 -0700 Subject: [PATCH 31/35] fix test - retry claude-3 image error 3 times --- litellm/tests/test_bedrock_completion.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/litellm/tests/test_bedrock_completion.py b/litellm/tests/test_bedrock_completion.py index b8b08003c..8d7f692dc 100644 --- a/litellm/tests/test_bedrock_completion.py +++ 
b/litellm/tests/test_bedrock_completion.py @@ -206,6 +206,7 @@ def test_completion_bedrock_claude_sts_client_auth(): # test_completion_bedrock_claude_sts_client_auth() + @pytest.mark.skip(reason="We don't have Circle CI OIDC credentials as yet") def test_completion_bedrock_claude_sts_oidc_auth(): print("\ncalling bedrock claude with oidc auth") @@ -244,7 +245,7 @@ def test_bedrock_extra_headers(): messages=messages, max_tokens=10, temperature=0.78, - extra_headers={"x-key": "x_key_value"} + extra_headers={"x-key": "x_key_value"}, ) # Add any assertions here to check the response assert len(response.choices) > 0 @@ -259,7 +260,7 @@ def test_bedrock_claude_3(): try: litellm.set_verbose = True data = { - "max_tokens": 2000, + "max_tokens": 100, "stream": False, "temperature": 0.3, "messages": [ @@ -282,6 +283,7 @@ def test_bedrock_claude_3(): } response: ModelResponse = completion( model="bedrock/anthropic.claude-3-sonnet-20240229-v1:0", + num_retries=3, # messages=messages, # max_tokens=10, # temperature=0.78, From 5d24b47d2b621153fab0e0936bc84bf53a3eef36 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 16:23:48 -0700 Subject: [PATCH 32/35] fix standardize llm exception alert to msg: `value` --- litellm/proxy/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index 09e772e10..3dac1563c 100644 --- a/litellm/proxy/utils.py +++ b/litellm/proxy/utils.py @@ -436,7 +436,7 @@ class ProxyLogging: asyncio.create_task( self.alerting_handler( - message=f"LLM API call failed: {exception_str}", + message=f"LLM API call failed: `{exception_str}`", level="High", alert_type="llm_exceptions", request_data=request_data, From 8263d15ca354291f2fb07de6a8603f60d8630886 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 16:26:11 -0700 Subject: [PATCH 33/35] fix - standardize slack alerting format --- litellm/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/litellm/utils.py b/litellm/utils.py index 8ac6b58d8..c12741a97 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -8315,18 +8315,18 @@ def exception_type( _deployment = _metadata.get("deployment") extra_information = f"\nModel: {model}" if _api_base: - extra_information += f"\nAPI Base: {_api_base}" + extra_information += f"\nAPI Base: `{_api_base}`" if messages and len(messages) > 0: - extra_information += f"\nMessages: {messages}" + extra_information += f"\nMessages: `{messages}`" if _model_group is not None: - extra_information += f"\nmodel_group: {_model_group}\n" + extra_information += f"\nmodel_group: `{_model_group}`\n" if _deployment is not None: - extra_information += f"\ndeployment: {_deployment}\n" + extra_information += f"\ndeployment: `{_deployment}`\n" if _vertex_project is not None: - extra_information += f"\nvertex_project: {_vertex_project}\n" + extra_information += f"\nvertex_project: `{_vertex_project}`\n" if _vertex_location is not None: - extra_information += f"\nvertex_location: {_vertex_location}\n" + extra_information += f"\nvertex_location: `{_vertex_location}`\n" # on litellm proxy add key name + team to exceptions extra_information = _add_key_name_and_team_to_alert( From f11de863f6553dab0c1ecf3e3d842856719cc318 Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 16:29:16 -0700 Subject: [PATCH 34/35] fix - standardize format of exceptions occuring on slack alerts --- litellm/router.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litellm/router.py 
b/litellm/router.py index 6400ff64e..a45b9d396 100644 --- a/litellm/router.py +++ b/litellm/router.py @@ -3775,7 +3775,7 @@ class Router: ) asyncio.create_task( proxy_logging_obj.slack_alerting_instance.send_alert( - message=f"Router: Cooling down Deployment:\nModel Name: {_model_name}\nAPI Base: {_api_base}\n{self.cooldown_time} seconds. Got exception: {str(exception_status)}. Change 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns", + message=f"Router: Cooling down Deployment:\nModel Name: `{_model_name}`\nAPI Base: `{_api_base}`\nCooldown Time: `{self.cooldown_time}` seconds\nException Status Code: `{str(exception_status)}`\n\nChange 'cooldown_time' + 'allowed_fails' under 'Router Settings' on proxy UI, or via config - https://docs.litellm.ai/docs/proxy/reliability#fallbacks--retries--timeouts--cooldowns", alert_type="cooldown_deployment", level="Low", ) From f574127bc46d8e5f5f207787b002ab8bdb11d5ae Mon Sep 17 00:00:00 2001 From: Ishaan Jaff Date: Mon, 20 May 2024 17:14:03 -0700 Subject: [PATCH 35/35] fix - raise 404 when team does not exist --- litellm/proxy/proxy_server.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py index 351984c2b..a92a25997 100644 --- a/litellm/proxy/proxy_server.py +++ b/litellm/proxy/proxy_server.py @@ -7744,6 +7744,12 @@ async def team_info( team_info = await prisma_client.get_data( team_id=team_id, table_name="team", query_type="find_unique" ) + if team_info is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail={"message": f"Team not found, passed team id: {team_id}."}, + ) + ## GET ALL KEYS ## keys = await prisma_client.get_data( team_id=team_id,
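
The final patch makes `/team/info` fail fast with a 404 when the requested team id has no row in the database, rather than letting the `None` team record propagate into the later key and spend lookups. Below is a minimal client-side sketch of how a caller might consume that behavior; the proxy base URL, the admin key environment variable, and the assumption that the route is exposed as `GET /team/info?team_id=...` are illustrative placeholders, not values taken from these patches.

```python
# Hypothetical caller for the proxy's team info route, illustrating the new
# fail-fast behavior. The base URL, admin key env var, and exact response body
# are assumptions for this sketch, not taken from the patch series.
import os
from typing import Optional

import httpx


def get_team_info(team_id: str) -> Optional[dict]:
    """Return team info from the proxy, or None if the team does not exist."""
    resp = httpx.get(
        "http://0.0.0.0:4000/team/info",  # assumed proxy base URL
        params={"team_id": team_id},
        headers={"Authorization": f"Bearer {os.environ['LITELLM_MASTER_KEY']}"},
    )
    if resp.status_code == 404:
        # With this patch, an unknown team id surfaces as a clean 404
        # ("Team not found, passed team id: ...") instead of an error raised
        # deeper in the handler once the missing team row is dereferenced.
        return None
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    print(get_team_info("nonexistent-team-id"))
```

Raising the 404 before the key and spend queries keeps the error local to the missing team, so callers see a clear "not found" instead of an unrelated failure from further down the handler.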