diff --git a/cookbook/litellm_proxy_server/grafana_dashboard/grafana_dashboard.json b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/grafana_dashboard.json similarity index 100% rename from cookbook/litellm_proxy_server/grafana_dashboard/grafana_dashboard.json rename to cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/grafana_dashboard.json diff --git a/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/readme.md b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/readme.md new file mode 100644 index 0000000000..1f193aba70 --- /dev/null +++ b/cookbook/litellm_proxy_server/grafana_dashboard/dashboard_1/readme.md @@ -0,0 +1,6 @@ +## This folder contains the `json` for creating the following Grafana Dashboard + +### Pre-Requisites +- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus + +![1716623265684](https://github.com/BerriAI/litellm/assets/29436595/0e12c57e-4a2d-4850-bd4f-e4294f87a814) diff --git a/cookbook/litellm_proxy_server/grafana_dashboard/readme.md b/cookbook/litellm_proxy_server/grafana_dashboard/readme.md index 1f193aba70..fae1d792d2 100644 --- a/cookbook/litellm_proxy_server/grafana_dashboard/readme.md +++ b/cookbook/litellm_proxy_server/grafana_dashboard/readme.md @@ -1,6 +1,6 @@ -## This folder contains the `json` for creating the following Grafana Dashboard +## Contains example Grafana Dashboard made for LiteLLM Proxy Server + +This folder contains the `json` for creating Grafana Dashboards ### Pre-Requisites -- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus - -![1716623265684](https://github.com/BerriAI/litellm/assets/29436595/0e12c57e-4a2d-4850-bd4f-e4294f87a814) +- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus \ No newline at end of file diff --git a/docs/my-website/docs/completion/vision.md b/docs/my-website/docs/completion/vision.md index ea04b1e1e1..69af03c987 100644 --- a/docs/my-website/docs/completion/vision.md +++ b/docs/my-website/docs/completion/vision.md @@ -39,7 +39,7 @@ Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vis ```python assert litellm.supports_vision(model="gpt-4-vision-preview") == True -assert litellm.supports_vision(model="gemini-1.0-pro-visionn") == True +assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True assert litellm.supports_vision(model="gpt-3.5-turbo") == False ``` diff --git a/docs/my-website/docs/proxy/guardrails.md b/docs/my-website/docs/proxy/guardrails.md index 04c8602e9f..4c4d0c0e91 100644 --- a/docs/my-website/docs/proxy/guardrails.md +++ b/docs/my-website/docs/proxy/guardrails.md @@ -29,6 +29,9 @@ litellm_settings: - prompt_injection: # your custom name for guardrail callbacks: [lakera_prompt_injection] # litellm callbacks to use default_on: true # will run on all llm requests when true + - pii_masking: # your custom name for guardrail + callbacks: [presidio] # use the litellm presidio callback + default_on: false # by default this is off for all requests - hide_secrets_guard: callbacks: [hide_secrets] default_on: false @@ -37,6 +40,12 @@ litellm_settings: default_on: false ``` +:::info + +Since `pii_masking` is default Off for all requests, [you can switch it on per API Key](#switch-guardrails-onoff-per-api-key) + +::: + ### 2. 
Test it Run litellm proxy @@ -185,6 +194,85 @@ print(response) +## Switch Guardrails On/Off Per API Key + +❓ Use this when you need to switch guardrails on/off per API Key + +**Step 1** Create Key with `pii_masking` On + +**NOTE:** We defined `pii_masking` [on step 1](#1-setup-guardrails-on-litellm-proxy-configyaml) + +👉 Set `"permissions": {"pii_masking": true}` with either `/key/generate` or `/key/update` + +This means the `pii_masking` guardrail is on for all requests from this API Key + +:::info + +If you need to switch `pii_masking` off for an API Key set `"permissions": {"pii_masking": false}` with either `/key/generate` or `/key/update` + +::: + + + + + +```shell +curl --location 'http://0.0.0.0:4000/key/generate' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "permissions": {"pii_masking": true} +}' +``` + +```shell +# {"permissions":{"pii_masking":true},"key":"sk-jNm1Zar7XfNdZXp49Z1kSQ"} +``` + + + + +```shell +curl --location 'http://0.0.0.0:4000/key/update' \ + --header 'Authorization: Bearer sk-1234' \ + --header 'Content-Type: application/json' \ + --data '{ + "key": "sk-jNm1Zar7XfNdZXp49Z1kSQ", + "permissions": {"pii_masking": true} +}' +``` + +```shell +# {"permissions":{"pii_masking":true},"key":"sk-jNm1Zar7XfNdZXp49Z1kSQ"} +``` + + + + +**Step 2** Test it with new key + +```shell +curl --location 'http://0.0.0.0:4000/chat/completions' \ + --header 'Authorization: Bearer sk-jNm1Zar7XfNdZXp49Z1kSQ' \ + --header 'Content-Type: application/json' \ + --data '{ + "model": "llama3", + "messages": [ + { + "role": "user", + "content": "does my phone number look correct - +1 412-612-9992" + } + ] +}' +``` + +Expect to NOT see `+1 412-612-9992` in your server logs on your callback. + +:::info +The `pii_masking` guardrail ran on this request because api key=sk-jNm1Zar7XfNdZXp49Z1kSQ has `"permissions": {"pii_masking": true}` +::: + + ## Spec for `guardrails` on litellm config @@ -208,9 +296,9 @@ litellm_settings: #### Guardrail: `prompt_injection`: Configuration for detecting and preventing prompt injection attacks. -- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation) +- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation) - `default_on`: Boolean flag determining if this guardrail runs on all LLM requests by default. #### Guardrail: `your-custom-guardrail`: Configuration for a user-defined custom guardrail. -- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation]` +- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]` - `default_on`: Boolean flag determining if this custom guardrail runs by default, set to false. 
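
> Editor's note: for readers following the per-API-key guardrail doc above, here is a minimal Python sketch (not part of the patch) equivalent to the documented `curl` call to `/key/generate`. It assumes, as the doc does, that the proxy runs at `http://0.0.0.0:4000` and `sk-1234` is the admin key.

```python
# Sketch: create a proxy API key with the pii_masking guardrail switched on,
# mirroring the curl example in docs/my-website/docs/proxy/guardrails.md above.
import requests

resp = requests.post(
    "http://0.0.0.0:4000/key/generate",          # assumed proxy address from the doc
    headers={
        "Authorization": "Bearer sk-1234",        # admin key from the doc example
        "Content-Type": "application/json",
    },
    json={"permissions": {"pii_masking": True}},  # guardrail name defined in config.yaml
)
resp.raise_for_status()
new_key = resp.json()["key"]
print(new_key)  # requests made with this key will run the pii_masking guardrail
```

> The returned key is then used as the `Authorization: Bearer` header on `/chat/completions`, as in Step 2 of the doc above.
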
diff --git a/docs/my-website/docs/proxy/logging.md b/docs/my-website/docs/proxy/logging.md index 83bf8ee95d..c2f583366c 100644 --- a/docs/my-website/docs/proxy/logging.md +++ b/docs/my-website/docs/proxy/logging.md @@ -7,10 +7,13 @@ import TabItem from '@theme/TabItem'; Log Proxy Input, Output, Exceptions using Langfuse, OpenTelemetry, Custom Callbacks, DataDog, DynamoDB, s3 Bucket +## Table of Contents + - [Logging to Langfuse](#logging-proxy-inputoutput---langfuse) - [Logging with OpenTelemetry (OpenTelemetry)](#logging-proxy-inputoutput-in-opentelemetry-format) - [Async Custom Callbacks](#custom-callback-class-async) - [Async Custom Callback APIs](#custom-callback-apis-async) +- [Logging to Galileo](#logging-llm-io-to-galileo) - [Logging to OpenMeter](#logging-proxy-inputoutput---langfuse) - [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets) - [Logging to DataDog](#logging-proxy-inputoutput---datadog) @@ -1056,6 +1059,68 @@ litellm_settings: Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API + +## Logging LLM IO to Galileo +[BETA] + +Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/) + +:::info + +Beta Integration + +::: + +**Required Env Variables** + +```bash +export GALILEO_BASE_URL="" # For most users, this is the same as their console URL except with the word 'console' replaced by 'api' (e.g. http://www.console.galileo.myenterprise.com -> http://www.api.galileo.myenterprise.com) +export GALILEO_PROJECT_ID="" +export GALILEO_USERNAME="" +export GALILEO_PASSWORD="" +``` + +### Quick Start + +1. Add to Config.yaml +```yaml +model_list: +- litellm_params: + api_base: https://exampleopenaiendpoint-production.up.railway.app/ + api_key: my-fake-key + model: openai/my-fake-model + model_name: fake-openai-endpoint + +litellm_settings: + success_callback: ["galileo"] # 👈 KEY CHANGE +``` + +2. Start Proxy + +``` +litellm --config /path/to/config.yaml +``` + +3. Test it! 
+ +```bash +curl --location 'http://0.0.0.0:4000/chat/completions' \ +--header 'Content-Type: application/json' \ +--data ' { + "model": "fake-openai-endpoint", + "messages": [ + { + "role": "user", + "content": "what llm are you" + } + ], + } +' +``` + + +🎉 That's it - Expect to see your Logs on your Galileo Dashboard + ## Logging Proxy Cost + Usage - OpenMeter Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md) diff --git a/docs/my-website/docs/proxy/prometheus.md b/docs/my-website/docs/proxy/prometheus.md index 6790b25b02..61d1397ac2 100644 --- a/docs/my-website/docs/proxy/prometheus.md +++ b/docs/my-website/docs/proxy/prometheus.md @@ -132,3 +132,9 @@ litellm_settings: | `litellm_redis_latency` | histogram latency for redis calls | | `litellm_redis_fails` | Number of failed redis calls | | `litellm_self_latency` | Histogram latency for successful litellm api call | + +## 🔥 Community Maintained Grafana Dashboards + +Link to Grafana Dashboards made by LiteLLM community + +https://github.com/BerriAI/litellm/tree/main/cookbook/litellm_proxy_server/grafana_dashboard \ No newline at end of file diff --git a/litellm/caching.py b/litellm/caching.py index 64488289a8..0812d8c6bb 100644 --- a/litellm/caching.py +++ b/litellm/caching.py @@ -248,9 +248,15 @@ class RedisCache(BaseCache): # asyncio.get_running_loop().create_task(self.ping()) result = asyncio.get_running_loop().create_task(self.ping()) except Exception as e: - verbose_logger.error( - "Error connecting to Async Redis client", extra={"error": str(e)} - ) + if "no running event loop" in str(e): + verbose_logger.debug( + "Ignoring async redis ping. No running event loop." + ) + else: + verbose_logger.error( + "Error connecting to Async Redis client - {}".format(str(e)), + extra={"error": str(e)}, + ) ### SYNC HEALTH PING ### try: diff --git a/litellm/integrations/galileo.py b/litellm/integrations/galileo.py new file mode 100644 index 0000000000..51d845fcb3 --- /dev/null +++ b/litellm/integrations/galileo.py @@ -0,0 +1,159 @@ +import os +from datetime import datetime +from typing import Any, Dict, List, Optional + +import httpx +from pydantic import BaseModel, Field + +import litellm +from litellm._logging import verbose_logger +from litellm.integrations.custom_logger import CustomLogger +from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler + + +# from here: https://docs.rungalileo.io/galileo/gen-ai-studio-products/galileo-observe/how-to/logging-data-via-restful-apis#structuring-your-records +class LLMResponse(BaseModel): + latency_ms: int + status_code: int + input_text: str + output_text: str + node_type: str + model: str + num_input_tokens: int + num_output_tokens: int + output_logprobs: Optional[Dict[str, Any]] = Field( + default=None, + description="Optional. 
When available, logprobs are used to compute Uncertainty.", + ) + created_at: str = Field( + ..., description='timestamp constructed in "%Y-%m-%dT%H:%M:%S" format' + ) + tags: Optional[List[str]] = None + user_metadata: Optional[Dict[str, Any]] = None + + +class GalileoObserve(CustomLogger): + def __init__(self) -> None: + self.in_memory_records: List[dict] = [] + self.batch_size = 1 + self.base_url = os.getenv("GALILEO_BASE_URL", None) + self.project_id = os.getenv("GALILEO_PROJECT_ID", None) + self.headers = None + self.async_httpx_handler = AsyncHTTPHandler( + timeout=httpx.Timeout(timeout=600.0, connect=5.0) + ) + pass + + def set_galileo_headers(self): + # following https://docs.rungalileo.io/galileo/gen-ai-studio-products/galileo-observe/how-to/logging-data-via-restful-apis#logging-your-records + + headers = { + "accept": "application/json", + "Content-Type": "application/x-www-form-urlencoded", + } + galileo_login_response = self.async_httpx_handler.post( + url=f"{self.base_url}/login", + headers=headers, + data={ + "username": os.getenv("GALILEO_USERNAME"), + "password": os.getenv("GALILEO_PASSWORD"), + }, + ) + + access_token = galileo_login_response.json()["access_token"] + + self.headers = { + "accept": "application/json", + "Content-Type": "application/json", + "Authorization": f"Bearer {access_token}", + } + + def get_output_str_from_response(self, response_obj, kwargs): + output = None + if response_obj is not None and ( + kwargs.get("call_type", None) == "embedding" + or isinstance(response_obj, litellm.EmbeddingResponse) + ): + output = None + elif response_obj is not None and isinstance( + response_obj, litellm.ModelResponse + ): + output = response_obj["choices"][0]["message"].json() + elif response_obj is not None and isinstance( + response_obj, litellm.TextCompletionResponse + ): + output = response_obj.choices[0].text + elif response_obj is not None and isinstance( + response_obj, litellm.ImageResponse + ): + output = response_obj["data"] + + return output + + async def async_log_success_event( + self, + kwargs, + start_time, + end_time, + response_obj, + ): + verbose_logger.debug(f"On Async Success") + + _latency_ms = int((end_time - start_time).total_seconds() * 1000) + _call_type = kwargs.get("call_type", "litellm") + input_text = litellm.utils.get_formatted_prompt( + data=kwargs, call_type=_call_type + ) + + _usage = response_obj.get("usage", {}) or {} + num_input_tokens = _usage.get("prompt_tokens", 0) + num_output_tokens = _usage.get("completion_tokens", 0) + + output_text = self.get_output_str_from_response( + response_obj=response_obj, kwargs=kwargs + ) + + request_record = LLMResponse( + latency_ms=_latency_ms, + status_code=200, + input_text=input_text, + output_text=output_text, + node_type=_call_type, + model=kwargs.get("model", "-"), + num_input_tokens=num_input_tokens, + num_output_tokens=num_output_tokens, + created_at=start_time.strftime( + "%Y-%m-%dT%H:%M:%S" + ), # timestamp str constructed in "%Y-%m-%dT%H:%M:%S" format + ) + + # dump to dict + request_dict = request_record.model_dump() + self.in_memory_records.append(request_dict) + + if len(self.in_memory_records) >= self.batch_size: + await self.flush_in_memory_records() + + async def flush_in_memory_records(self): + verbose_logger.debug("flushing in memory records") + response = await self.async_httpx_handler.post( + url=f"{self.base_url}/projects/{self.project_id}/observe/ingest", + headers=self.headers, + json={"records": self.in_memory_records}, + ) + + if response.status_code == 200: + 
verbose_logger.debug( + "Galileo Logger:successfully flushed in memory records" + ) + self.in_memory_records = [] + else: + verbose_logger.debug("Galileo Logger: failed to flush in memory records") + verbose_logger.debug( + "Galileo Logger error=%s, status code=%s", + response.text, + response.status_code, + ) + + async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): + verbose_logger.debug(f"On Async Failure") diff --git a/litellm/litellm_core_utils/litellm_logging.py b/litellm/litellm_core_utils/litellm_logging.py index 4edbce5e15..381bcc1ac9 100644 --- a/litellm/litellm_core_utils/litellm_logging.py +++ b/litellm/litellm_core_utils/litellm_logging.py @@ -56,6 +56,7 @@ from ..integrations.clickhouse import ClickhouseLogger from ..integrations.custom_logger import CustomLogger from ..integrations.datadog import DataDogLogger from ..integrations.dynamodb import DyanmoDBLogger +from ..integrations.galileo import GalileoObserve from ..integrations.greenscale import GreenscaleLogger from ..integrations.helicone import HeliconeLogger from ..integrations.lago import LagoLogger @@ -153,11 +154,6 @@ class Logging: langfuse_secret=None, langfuse_host=None, ): - if call_type not in [item.value for item in CallTypes]: - allowed_values = ", ".join([item.value for item in CallTypes]) - raise ValueError( - f"Invalid call_type {call_type}. Allowed values: {allowed_values}" - ) if messages is not None: if isinstance(messages, str): messages = [ @@ -604,8 +600,7 @@ class Logging: verbose_logger.error( "LiteLLM.LoggingError: [Non-Blocking] Exception occurred while building complete streaming response in success logging {}\n{}".format( str(e), traceback.format_exc() - ), - log_level="ERROR", + ) ) complete_streaming_response = None else: @@ -1612,6 +1607,7 @@ class Logging: ) == False ): # custom logger class + callback.log_failure_event( start_time=start_time, end_time=end_time, @@ -1929,6 +1925,15 @@ def _init_custom_logger_compatible_class( _openmeter_logger = OpenMeterLogger() _in_memory_loggers.append(_openmeter_logger) return _openmeter_logger # type: ignore + + elif logging_integration == "galileo": + for callback in _in_memory_loggers: + if isinstance(callback, GalileoObserve): + return callback # type: ignore + + galileo_logger = GalileoObserve() + _in_memory_loggers.append(galileo_logger) + return galileo_logger # type: ignore elif logging_integration == "logfire": if "LOGFIRE_TOKEN" not in os.environ: raise ValueError("LOGFIRE_TOKEN not found in environment variables") @@ -1985,6 +1990,10 @@ def get_custom_logger_compatible_class( for callback in _in_memory_loggers: if isinstance(callback, OpenMeterLogger): return callback + elif logging_integration == "galileo": + for callback in _in_memory_loggers: + if isinstance(callback, GalileoObserve): + return callback elif logging_integration == "logfire": if "LOGFIRE_TOKEN" not in os.environ: raise ValueError("LOGFIRE_TOKEN not found in environment variables") diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py index e87618f02e..a4521a7031 100644 --- a/litellm/llms/anthropic.py +++ b/litellm/llms/anthropic.py @@ -49,7 +49,7 @@ class AnthropicConstants(Enum): class AnthropicError(Exception): def __init__(self, status_code, message): self.status_code = status_code - self.message = message + self.message: str = message self.request = httpx.Request( method="POST", url="https://api.anthropic.com/v1/messages" ) @@ -830,6 +830,16 @@ class ModelResponseIterator: .get("usage", {}) .get("output_tokens", 0), ) + elif 
type_chunk == "error": + """ + {"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"} } + """ + _error_dict = chunk.get("error", {}) or {} + message = _error_dict.get("message", None) or str(chunk) + raise AnthropicError( + message=message, + status_code=500, # it looks like Anthropic API does not return a status code in the chunk error - default to 500 + ) returned_chunk = GenericStreamingChunk( text=text, tool_use=tool_use, diff --git a/litellm/llms/nvidia_nim.py b/litellm/llms/nvidia_nim.py index ebcc84c13e..6d2e4316b2 100644 --- a/litellm/llms/nvidia_nim.py +++ b/litellm/llms/nvidia_nim.py @@ -58,21 +58,80 @@ class NvidiaNimConfig: and v is not None } - def get_supported_openai_params(self): - return [ - "stream", - "temperature", - "top_p", - "frequency_penalty", - "presence_penalty", - "max_tokens", - "stop", - ] + def get_supported_openai_params(self, model: str) -> list: + """ + Get the supported OpenAI params for the given model + + + Updated on July 5th, 2024 - based on https://docs.api.nvidia.com/nim/reference + """ + if model in [ + "google/recurrentgemma-2b", + "google/gemma-2-27b-it", + "google/gemma-2-9b-it", + "gemma-2-9b-it", + ]: + return ["stream", "temperature", "top_p", "max_tokens", "stop", "seed"] + elif model == "nvidia/nemotron-4-340b-instruct": + return [ + "stream", + "temperature", + "top_p", + "max_tokens", + ] + elif model == "nvidia/nemotron-4-340b-reward": + return [ + "stream", + ] + elif model in ["google/codegemma-1.1-7b"]: + # most params - but no 'seed' :( + return [ + "stream", + "temperature", + "top_p", + "frequency_penalty", + "presence_penalty", + "max_tokens", + "stop", + ] + else: + # DEFAULT Case - The vast majority of Nvidia NIM Models lie here + # "upstage/solar-10.7b-instruct", + # "snowflake/arctic", + # "seallms/seallm-7b-v2.5", + # "nvidia/llama3-chatqa-1.5-8b", + # "nvidia/llama3-chatqa-1.5-70b", + # "mistralai/mistral-large", + # "mistralai/mixtral-8x22b-instruct-v0.1", + # "mistralai/mixtral-8x7b-instruct-v0.1", + # "mistralai/mistral-7b-instruct-v0.3", + # "mistralai/mistral-7b-instruct-v0.2", + # "mistralai/codestral-22b-instruct-v0.1", + # "microsoft/phi-3-small-8k-instruct", + # "microsoft/phi-3-small-128k-instruct", + # "microsoft/phi-3-mini-4k-instruct", + # "microsoft/phi-3-mini-128k-instruct", + # "microsoft/phi-3-medium-4k-instruct", + # "microsoft/phi-3-medium-128k-instruct", + # "meta/llama3-70b-instruct", + # "meta/llama3-8b-instruct", + # "meta/llama2-70b", + # "meta/codellama-70b", + return [ + "stream", + "temperature", + "top_p", + "frequency_penalty", + "presence_penalty", + "max_tokens", + "stop", + "seed", + ] def map_openai_params( - self, non_default_params: dict, optional_params: dict + self, model: str, non_default_params: dict, optional_params: dict ) -> dict: - supported_openai_params = self.get_supported_openai_params() + supported_openai_params = self.get_supported_openai_params(model=model) for param, value in non_default_params.items(): if param in supported_openai_params: optional_params[param] = value diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index be2fab51d1..4f9242af4b 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -397,7 +397,7 @@ "input_cost_per_second": 0, "output_cost_per_second": 0.0001, "litellm_provider": "openai" - }, + }, "tts-1": { "mode": "audio_speech", "input_cost_per_character": 0.000015, @@ 
-2022,10 +2022,10 @@ "max_tokens": 8192, "max_input_tokens": 2097152, "max_output_tokens": 8192, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.0000035, + "input_cost_per_token_above_128k_tokens": 0.000007, + "output_cost_per_token": 0.0000105, + "output_cost_per_token_above_128k_tokens": 0.000021, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, @@ -2033,16 +2033,16 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-latest": { "max_tokens": 8192, "max_input_tokens": 1048576, "max_output_tokens": 8192, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, + "input_cost_per_token": 0.0000035, + "input_cost_per_token_above_128k_tokens": 0.000007, "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "output_cost_per_token_above_128k_tokens": 0.000021, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, @@ -2050,7 +2050,7 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, - "source": "https://ai.google.dev/models/gemini" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro-vision": { "max_tokens": 2048, diff --git a/litellm/proxy/guardrails/guardrail_helpers.py b/litellm/proxy/guardrails/guardrail_helpers.py index 8a25abf3a9..682428cc9d 100644 --- a/litellm/proxy/guardrails/guardrail_helpers.py +++ b/litellm/proxy/guardrails/guardrail_helpers.py @@ -1,5 +1,6 @@ from litellm._logging import verbose_proxy_logger from litellm.proxy.guardrails.init_guardrails import guardrail_name_config_map +from litellm.proxy.proxy_server import UserAPIKeyAuth from litellm.types.guardrails import * @@ -47,3 +48,44 @@ async def should_proceed_based_on_metadata(data: dict, guardrail_name: str) -> b return False return True + + +async def should_proceed_based_on_api_key( + user_api_key_dict: UserAPIKeyAuth, guardrail_name: str +) -> bool: + """ + checks if this guardrail should be applied to this call + """ + if user_api_key_dict.permissions is not None: + # { prompt_injection: true, rail_2: false } + verbose_proxy_logger.debug( + "Guardrails valid for API Key= %s - checking which to apply", + user_api_key_dict.permissions, + ) + + if not isinstance(user_api_key_dict.permissions, dict): + verbose_proxy_logger.error( + "API Key permissions must be a dict - %s running guardrail %s", + user_api_key_dict, + guardrail_name, + ) + return True + + for _guardrail_name, should_run in user_api_key_dict.permissions.items(): + if should_run is False: + verbose_proxy_logger.debug( + "Guardrail %s skipped because request set to False", + _guardrail_name, + ) + continue + + # lookup the guardrail in guardrail_name_config_map + guardrail_item: GuardrailItem = guardrail_name_config_map[_guardrail_name] + + guardrail_callbacks = guardrail_item.callbacks + if guardrail_name in guardrail_callbacks: + return True + + # Do not proceeed if - "metadata": { "guardrails": { "lakera_prompt_injection": false } } + return False + return True diff --git a/litellm/proxy/utils.py b/litellm/proxy/utils.py index f092511072..32b74be7c6 100644 --- a/litellm/proxy/utils.py +++ 
b/litellm/proxy/utils.py @@ -7,6 +7,7 @@ import os import re import smtplib import subprocess +import threading import time import traceback from datetime import datetime, timedelta @@ -49,6 +50,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter from litellm.proxy.hooks.parallel_request_limiter import ( _PROXY_MaxParallelRequestsHandler, ) +from litellm.types.utils import CallTypes if TYPE_CHECKING: from opentelemetry.trace import Span as _Span @@ -354,35 +356,6 @@ class ProxyLogging: print_verbose(f"final data being sent to {call_type} call: {data}") return data except Exception as e: - if "litellm_logging_obj" in data: - logging_obj: litellm.litellm_core_utils.litellm_logging.Logging = data[ - "litellm_logging_obj" - ] - - ## ASYNC FAILURE HANDLER ## - error_message = "" - if isinstance(e, HTTPException): - if isinstance(e.detail, str): - error_message = e.detail - elif isinstance(e.detail, dict): - error_message = json.dumps(e.detail) - else: - error_message = str(e) - else: - error_message = str(e) - error_raised = Exception(f"{error_message}") - await logging_obj.async_failure_handler( - exception=error_raised, - traceback_exception=traceback.format_exc(), - ) - - ## SYNC FAILURE HANDLER ## - try: - logging_obj.failure_handler( - error_raised, traceback.format_exc() - ) # DO NOT MAKE THREADED - router retry fallback relies on this! - except Exception as error_val: - pass raise e async def during_call_hook( @@ -597,25 +570,39 @@ class ProxyLogging: ) ### LOGGING ### - litellm_logging_obj: Optional[Logging] = request_data.get( - "litellm_logging_obj", None - ) - if isinstance(original_exception, HTTPException): + litellm_logging_obj: Optional[Logging] = request_data.get( + "litellm_logging_obj", None + ) if litellm_logging_obj is None: + import uuid + + request_data["litellm_call_id"] = str(uuid.uuid4()) litellm_logging_obj, data = litellm.utils.function_setup( original_function="IGNORE_THIS", rules_obj=litellm.utils.Rules(), start_time=datetime.now(), **request_data, ) - # log the custom exception - await litellm_logging_obj.async_failure_handler( - exception=original_exception, - traceback_exception=traceback.format_exc(), - start_time=time.time(), - end_time=time.time(), - ) + + if litellm_logging_obj is not None: + # log the custom exception + await litellm_logging_obj.async_failure_handler( + exception=original_exception, + traceback_exception=traceback.format_exc(), + start_time=time.time(), + end_time=time.time(), + ) + + threading.Thread( + target=litellm_logging_obj.failure_handler, + args=( + original_exception, + traceback.format_exc(), + time.time(), + time.time(), + ), + ).start() for callback in litellm.callbacks: try: diff --git a/litellm/tests/test_caching.py b/litellm/tests/test_caching.py index c5e9c7f1fc..fa35f75de2 100644 --- a/litellm/tests/test_caching.py +++ b/litellm/tests/test_caching.py @@ -1607,7 +1607,17 @@ def test_caching_redis_simple(caplog): print(m) print(time.time() - s2) + redis_async_caching_error = False + redis_service_logging_error = False captured_logs = [rec.message for rec in caplog.records] - assert "LiteLLM Redis Caching: async set" not in captured_logs - assert "ServiceLogging.async_service_success_hook" not in captured_logs + print(f"captured_logs: {captured_logs}") + for item in captured_logs: + if "Error connecting to Async Redis client" in item: + redis_async_caching_error = True + + if "ServiceLogging.async_service_success_hook" in item: + redis_service_logging_error = True + + assert 
redis_async_caching_error is False + assert redis_service_logging_error is False diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 40c15d06df..0598c52dfe 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -3602,6 +3602,8 @@ def test_completion_nvidia_nim(): "content": "What's the weather like in Boston today in Fahrenheit?", } ], + presence_penalty=0.5, + frequency_penalty=0.1, ) # Add any assertions here to check the response print(response) diff --git a/litellm/tests/test_proxy_reject_logging.py b/litellm/tests/test_proxy_reject_logging.py index 7edd703815..865566d009 100644 --- a/litellm/tests/test_proxy_reject_logging.py +++ b/litellm/tests/test_proxy_reject_logging.py @@ -23,6 +23,8 @@ import os sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path +from typing import Literal + import pytest from fastapi import Request, Response from starlette.datastructures import URL @@ -49,18 +51,32 @@ from litellm.router import Router class testLogger(CustomLogger): def __init__(self): - self.reaches_failure_event = False + self.reaches_sync_failure_event = False + self.reaches_async_failure_event = False - async def async_pre_call_check(self, deployment: dict): + async def async_pre_call_hook( + self, + user_api_key_dict: UserAPIKeyAuth, + cache: DualCache, + data: dict, + call_type: Literal[ + "completion", + "text_completion", + "embeddings", + "image_generation", + "moderation", + "audio_transcription", + ], + ): raise HTTPException( status_code=429, detail={"error": "Max parallel request limit reached"} ) async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time): - self.reaches_failure_event = True - return await super().async_log_failure_event( - kwargs, response_obj, start_time, end_time - ) + self.reaches_async_failure_event = True + + def log_failure_event(self, kwargs, response_obj, start_time, end_time): + self.reaches_sync_failure_event = True router = Router( @@ -92,15 +108,15 @@ router = Router( ], }, ), - # ("/v1/completions", {"model": "fake-model", "prompt": "ping"}), - # ( - # "/v1/embeddings", - # { - # "input": "The food was delicious and the waiter...", - # "model": "text-embedding-ada-002", - # "encoding_format": "float", - # }, - # ), + ("/v1/completions", {"model": "fake-model", "prompt": "ping"}), + ( + "/v1/embeddings", + { + "input": "The food was delicious and the waiter...", + "model": "text-embedding-ada-002", + "encoding_format": "float", + }, + ), ], ) @pytest.mark.asyncio @@ -169,4 +185,6 @@ async def test_chat_completion_request_with_redaction(route, body): pass await asyncio.sleep(3) - assert _test_logger.reaches_failure_event is True + assert _test_logger.reaches_async_failure_event is True + + assert _test_logger.reaches_sync_failure_event is True diff --git a/litellm/utils.py b/litellm/utils.py index 50e31053da..d17ba8911c 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -531,6 +531,8 @@ def function_setup( call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value ): messages = kwargs.get("input", "speech") + else: + messages = "default-message-value" stream = True if "stream" in kwargs and kwargs["stream"] == True else False logging_obj = litellm.litellm_core_utils.litellm_logging.Logging( model=model, @@ -561,10 +563,8 @@ def function_setup( ) return logging_obj, kwargs except Exception as e: - import logging - - logging.debug( - f"[Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - 
{kwargs}" + verbose_logger.error( + f"litellm.utils.py::function_setup() - [Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}" ) raise e @@ -3184,7 +3184,9 @@ def get_optional_params( ) _check_valid_arg(supported_params=supported_params) optional_params = litellm.NvidiaNimConfig().map_openai_params( - non_default_params=non_default_params, optional_params=optional_params + model=model, + non_default_params=non_default_params, + optional_params=optional_params, ) elif custom_llm_provider == "fireworks_ai": supported_params = get_supported_openai_params( @@ -3776,7 +3778,7 @@ def get_supported_openai_params( elif custom_llm_provider == "fireworks_ai": return litellm.FireworksAIConfig().get_supported_openai_params() elif custom_llm_provider == "nvidia_nim": - return litellm.NvidiaNimConfig().get_supported_openai_params() + return litellm.NvidiaNimConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "volcengine": return litellm.VolcEngineConfig().get_supported_openai_params(model=model) elif custom_llm_provider == "groq": diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json index be2fab51d1..4f9242af4b 100644 --- a/model_prices_and_context_window.json +++ b/model_prices_and_context_window.json @@ -397,7 +397,7 @@ "input_cost_per_second": 0, "output_cost_per_second": 0.0001, "litellm_provider": "openai" - }, + }, "tts-1": { "mode": "audio_speech", "input_cost_per_character": 0.000015, @@ -2022,10 +2022,10 @@ "max_tokens": 8192, "max_input_tokens": 2097152, "max_output_tokens": 8192, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, - "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "input_cost_per_token": 0.0000035, + "input_cost_per_token_above_128k_tokens": 0.000007, + "output_cost_per_token": 0.0000105, + "output_cost_per_token_above_128k_tokens": 0.000021, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, @@ -2033,16 +2033,16 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, - "source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-1.5-pro-latest": { "max_tokens": 8192, "max_input_tokens": 1048576, "max_output_tokens": 8192, - "input_cost_per_token": 0.00000035, - "input_cost_per_token_above_128k_tokens": 0.0000007, + "input_cost_per_token": 0.0000035, + "input_cost_per_token_above_128k_tokens": 0.000007, "output_cost_per_token": 0.00000105, - "output_cost_per_token_above_128k_tokens": 0.0000021, + "output_cost_per_token_above_128k_tokens": 0.000021, "litellm_provider": "gemini", "mode": "chat", "supports_system_messages": true, @@ -2050,7 +2050,7 @@ "supports_vision": true, "supports_tool_choice": true, "supports_response_schema": true, - "source": "https://ai.google.dev/models/gemini" + "source": "https://ai.google.dev/pricing" }, "gemini/gemini-pro-vision": { "max_tokens": 2048, diff --git a/pyproject.toml b/pyproject.toml index 42bf87adba..b12d017dc5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "1.41.7" +version = "1.41.8" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT" @@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"] build-backend = "poetry.core.masonry.api" [tool.commitizen] -version = "1.41.7" +version = "1.41.8" 
version_files = [ "pyproject.toml:^version" ] diff --git a/requirements.txt b/requirements.txt index e71ab450bc..aa8cf1298c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -41,7 +41,7 @@ importlib-metadata==6.8.0 # for random utils tokenizers==0.14.0 # for calculating usage click==8.1.7 # for proxy cli jinja2==3.1.4 # for prompt templates -certifi==2023.7.22 # [TODO] clean up +certifi==2024.7.4 # [TODO] clean up aiohttp==3.9.0 # for network calls aioboto3==12.3.0 # for async sagemaker calls tenacity==8.2.3 # for retrying requests, when litellm.num_retries set
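
> Editor's note: a closing sketch (not part of the patch) of the model-aware Nvidia NIM parameter handling introduced in `litellm/llms/nvidia_nim.py` above. The model names are taken from the lists and comments in that file; exact results depend on the installed litellm version.

```python
# Sketch of NvidiaNimConfig's per-model OpenAI-param lookup, as changed in this PR.
import litellm

config = litellm.NvidiaNimConfig()

# The reward model only supports streaming, per the branch in get_supported_openai_params().
print(config.get_supported_openai_params(model="nvidia/nemotron-4-340b-reward"))
# expected: ["stream"]

# Default case: most NIM chat models get the full set, including "seed".
print(config.get_supported_openai_params(model="meta/llama3-70b-instruct"))

# map_openai_params() now takes the model, so unsupported params are dropped per model.
optional_params = config.map_openai_params(
    model="nvidia/nemotron-4-340b-instruct",
    non_default_params={"temperature": 0.2, "frequency_penalty": 0.1},
    optional_params={},
)
print(optional_params)  # frequency_penalty is dropped for this model; temperature is kept
```
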