Merge branch 'main' into litellm_tts_pricing

Krish Dholakia 2024-07-06 11:02:59 -07:00 committed by GitHub
commit f7ebb84488
22 changed files with 586 additions and 117 deletions


@ -0,0 +1,6 @@
## This folder contains the `json` for creating the following Grafana Dashboard
### Pre-Requisites
- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus
![1716623265684](https://github.com/BerriAI/litellm/assets/29436595/0e12c57e-4a2d-4850-bd4f-e4294f87a814)


@ -1,6 +1,6 @@
## This folder contains the `json` for creating the following Grafana Dashboard
## Contains example Grafana Dashboard made for LiteLLM Proxy Server
This folder contains the `json` for creating Grafana Dashboards
### Pre-Requisites
- Setup LiteLLM Proxy Prometheus Metrics https://docs.litellm.ai/docs/proxy/prometheus
![1716623265684](https://github.com/BerriAI/litellm/assets/29436595/0e12c57e-4a2d-4850-bd4f-e4294f87a814)


@ -39,7 +39,7 @@ Use `litellm.supports_vision(model="")` -> returns `True` if model supports `vis
```python
assert litellm.supports_vision(model="gpt-4-vision-preview") == True
assert litellm.supports_vision(model="gemini-1.0-pro-visionn") == True
assert litellm.supports_vision(model="gemini-1.0-pro-vision") == True
assert litellm.supports_vision(model="gpt-3.5-turbo") == False
```


@ -29,6 +29,9 @@ litellm_settings:
- prompt_injection: # your custom name for guardrail
callbacks: [lakera_prompt_injection] # litellm callbacks to use
default_on: true # will run on all llm requests when true
- pii_masking: # your custom name for guardrail
callbacks: [presidio] # use the litellm presidio callback
default_on: false # by default this is off for all requests
- hide_secrets_guard:
callbacks: [hide_secrets]
default_on: false
@ -37,6 +40,12 @@ litellm_settings:
default_on: false
```
:::info
Since `pii_masking` is off by default for all requests, [you can switch it on per API Key](#switch-guardrails-onoff-per-api-key)
:::
### 2. Test it
Run litellm proxy
@ -185,6 +194,85 @@ print(response)
</Tabs>
## Switch Guardrails On/Off Per API Key
❓ Use this when you need to switch guardrails on/off per API Key
**Step 1** Create a key with `pii_masking` on
**NOTE:** We defined `pii_masking` [in step 1](#1-setup-guardrails-on-litellm-proxy-configyaml)
👉 Set `"permissions": {"pii_masking": true}` with either `/key/generate` or `/key/update`
This means the `pii_masking` guardrail is on for all requests from this API Key
:::info
If you need to switch `pii_masking` off for an API Key set `"permissions": {"pii_masking": false}` with either `/key/generate` or `/key/update`
:::
<Tabs>
<TabItem value="/key/generate" label="/key/generate">
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"permissions": {"pii_masking": true}
}'
```
```shell
# {"permissions":{"pii_masking":true},"key":"sk-jNm1Zar7XfNdZXp49Z1kSQ"}
```
</TabItem>
<TabItem value="/key/update" label="/key/update">
```shell
curl --location 'http://0.0.0.0:4000/key/update' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"key": "sk-jNm1Zar7XfNdZXp49Z1kSQ",
"permissions": {"pii_masking": true}
}'
```
```shell
# {"permissions":{"pii_masking":true},"key":"sk-jNm1Zar7XfNdZXp49Z1kSQ"}
```
</TabItem>
</Tabs>
**Step 2** Test it with the new key
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-jNm1Zar7XfNdZXp49Z1kSQ' \
--header 'Content-Type: application/json' \
--data '{
"model": "llama3",
"messages": [
{
"role": "user",
"content": "does my phone number look correct - +1 412-612-9992"
}
]
}'
```
Expect to NOT see `+1 412-612-9992` in your server logs or on your callback.
:::info
The `pii_masking` guardrail ran on this request because the API key `sk-jNm1Zar7XfNdZXp49Z1kSQ` has `"permissions": {"pii_masking": true}`
:::
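If you prefer testing from Python instead of curl, here is a minimal sketch using the OpenAI SDK pointed at the proxy. It assumes the proxy is running locally on `http://0.0.0.0:4000` and reuses the example key from Step 1.

```python
# Minimal sketch: call the proxy with the key that has pii_masking enabled.
# Assumes the proxy from this guide is running locally on port 4000.
from openai import OpenAI

client = OpenAI(
    api_key="sk-jNm1Zar7XfNdZXp49Z1kSQ",  # key created in Step 1
    base_url="http://0.0.0.0:4000",
)

response = client.chat.completions.create(
    model="llama3",
    messages=[
        {
            "role": "user",
            "content": "does my phone number look correct - +1 412-612-9992",
        }
    ],
)
print(response.choices[0].message.content)
```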
## Spec for `guardrails` on litellm config
@ -208,9 +296,9 @@ litellm_settings:
#### Guardrail: `prompt_injection`: Configuration for detecting and preventing prompt injection attacks.
- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation)
- `callbacks`: List of LiteLLM callbacks used for this guardrail. [Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`](enterprise#content-moderation)
- `default_on`: Boolean flag determining if this guardrail runs on all LLM requests by default.
#### Guardrail: `your-custom-guardrail`: Configuration for a user-defined custom guardrail.
- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, llmguard_moderations, llamaguard_moderations, google_text_moderation]`
- `callbacks`: List of callbacks for this custom guardrail. Can be one of `[lakera_prompt_injection, hide_secrets, presidio, llmguard_moderations, llamaguard_moderations, google_text_moderation]`
- `default_on`: Boolean flag determining if this custom guardrail runs by default, set to false.


@ -7,10 +7,13 @@ import TabItem from '@theme/TabItem';
Log Proxy Input, Output, Exceptions using Langfuse, OpenTelemetry, Custom Callbacks, DataDog, DynamoDB, s3 Bucket
## Table of Contents
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
- [Logging with OpenTelemetry](#logging-proxy-inputoutput-in-opentelemetry-format)
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Galileo](#logging-llm-io-to-galileo)
- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
@ -1056,6 +1059,68 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging LLM IO to Galileo
[BETA]
Log LLM I/O on [www.rungalileo.io](https://www.rungalileo.io/)
:::info
Beta Integration
:::
**Required Env Variables**
```bash
export GALILEO_BASE_URL="" # For most users, this is the same as their console URL except with the word 'console' replaced by 'api' (e.g. http://www.console.galileo.myenterprise.com -> http://www.api.galileo.myenterprise.com)
export GALILEO_PROJECT_ID=""
export GALILEO_USERNAME=""
export GALILEO_PASSWORD=""
```
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://exampleopenaiendpoint-production.up.railway.app/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["galileo"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
🎉 That's it - Expect to see your Logs on your Galileo Dashboard
## Logging Proxy Cost + Usage - OpenMeter
Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)


@ -132,3 +132,9 @@ litellm_settings:
| `litellm_redis_latency` | histogram latency for redis calls |
| `litellm_redis_fails` | Number of failed redis calls |
| `litellm_self_latency` | Histogram latency for successful litellm api call |
## 🔥 Community Maintained Grafana Dashboards
Link to Grafana dashboards made by the LiteLLM community:
https://github.com/BerriAI/litellm/tree/main/cookbook/litellm_proxy_server/grafana_dashboard
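As a quick sanity check that the metrics above are being emitted, here is a small sketch that scrapes the proxy's Prometheus endpoint and prints the `litellm_` series. It assumes the proxy runs locally on port 4000 with the Prometheus callback enabled and serves metrics at `/metrics`, as described in the Prometheus docs linked above.

```python
# Minimal sketch: scrape the proxy's Prometheus endpoint and print litellm_* metrics.
# Assumes the proxy runs on http://0.0.0.0:4000 with the Prometheus callback enabled
# and exposes metrics at /metrics.
import httpx

resp = httpx.get("http://0.0.0.0:4000/metrics")
resp.raise_for_status()

for line in resp.text.splitlines():
    if line.startswith("litellm_"):
        print(line)
```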


@ -248,8 +248,14 @@ class RedisCache(BaseCache):
# asyncio.get_running_loop().create_task(self.ping())
result = asyncio.get_running_loop().create_task(self.ping())
except Exception as e:
if "no running event loop" in str(e):
verbose_logger.debug(
"Ignoring async redis ping. No running event loop."
)
else:
verbose_logger.error(
"Error connecting to Async Redis client", extra={"error": str(e)}
"Error connecting to Async Redis client - {}".format(str(e)),
extra={"error": str(e)},
)
### SYNC HEALTH PING ###
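The hunk above follows a common asyncio pattern: schedule the async ping only when an event loop is already running, and downgrade the "no running event loop" case to a debug log instead of an error. A standalone sketch of that pattern, independent of the Redis client:

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

async def ping() -> bool:
    # stand-in for the real async Redis ping
    await asyncio.sleep(0)
    return True

def schedule_ping_if_loop_running() -> None:
    try:
        # raises RuntimeError("no running event loop") when called outside a loop
        asyncio.get_running_loop().create_task(ping())
    except RuntimeError as e:
        if "no running event loop" in str(e):
            logger.debug("Ignoring async ping. No running event loop.")
        else:
            logger.error("Error scheduling async ping - %s", str(e))
```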


@ -0,0 +1,159 @@
import os
from datetime import datetime
from typing import Any, Dict, List, Optional
import httpx
from pydantic import BaseModel, Field
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
# from here: https://docs.rungalileo.io/galileo/gen-ai-studio-products/galileo-observe/how-to/logging-data-via-restful-apis#structuring-your-records
class LLMResponse(BaseModel):
latency_ms: int
status_code: int
input_text: str
output_text: str
node_type: str
model: str
num_input_tokens: int
num_output_tokens: int
output_logprobs: Optional[Dict[str, Any]] = Field(
default=None,
description="Optional. When available, logprobs are used to compute Uncertainty.",
)
created_at: str = Field(
..., description='timestamp constructed in "%Y-%m-%dT%H:%M:%S" format'
)
tags: Optional[List[str]] = None
user_metadata: Optional[Dict[str, Any]] = None
class GalileoObserve(CustomLogger):
def __init__(self) -> None:
self.in_memory_records: List[dict] = []
self.batch_size = 1
self.base_url = os.getenv("GALILEO_BASE_URL", None)
self.project_id = os.getenv("GALILEO_PROJECT_ID", None)
self.headers = None
self.async_httpx_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
pass
def set_galileo_headers(self):
# following https://docs.rungalileo.io/galileo/gen-ai-studio-products/galileo-observe/how-to/logging-data-via-restful-apis#logging-your-records
headers = {
"accept": "application/json",
"Content-Type": "application/x-www-form-urlencoded",
}
galileo_login_response = self.async_httpx_handler.post(
url=f"{self.base_url}/login",
headers=headers,
data={
"username": os.getenv("GALILEO_USERNAME"),
"password": os.getenv("GALILEO_PASSWORD"),
},
)
access_token = galileo_login_response.json()["access_token"]
self.headers = {
"accept": "application/json",
"Content-Type": "application/json",
"Authorization": f"Bearer {access_token}",
}
def get_output_str_from_response(self, response_obj, kwargs):
output = None
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
output = response_obj.choices[0].text
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
output = response_obj["data"]
return output
async def async_log_success_event(
self,
kwargs,
start_time,
end_time,
response_obj,
):
verbose_logger.debug(f"On Async Success")
_latency_ms = int((end_time - start_time).total_seconds() * 1000)
_call_type = kwargs.get("call_type", "litellm")
input_text = litellm.utils.get_formatted_prompt(
data=kwargs, call_type=_call_type
)
_usage = response_obj.get("usage", {}) or {}
num_input_tokens = _usage.get("prompt_tokens", 0)
num_output_tokens = _usage.get("completion_tokens", 0)
output_text = self.get_output_str_from_response(
response_obj=response_obj, kwargs=kwargs
)
request_record = LLMResponse(
latency_ms=_latency_ms,
status_code=200,
input_text=input_text,
output_text=output_text,
node_type=_call_type,
model=kwargs.get("model", "-"),
num_input_tokens=num_input_tokens,
num_output_tokens=num_output_tokens,
created_at=start_time.strftime(
"%Y-%m-%dT%H:%M:%S"
), # timestamp str constructed in "%Y-%m-%dT%H:%M:%S" format
)
# dump to dict
request_dict = request_record.model_dump()
self.in_memory_records.append(request_dict)
if len(self.in_memory_records) >= self.batch_size:
await self.flush_in_memory_records()
async def flush_in_memory_records(self):
verbose_logger.debug("flushing in memory records")
response = await self.async_httpx_handler.post(
url=f"{self.base_url}/projects/{self.project_id}/observe/ingest",
headers=self.headers,
json={"records": self.in_memory_records},
)
if response.status_code == 200:
verbose_logger.debug(
"Galileo Logger:successfully flushed in memory records"
)
self.in_memory_records = []
else:
verbose_logger.debug("Galileo Logger: failed to flush in memory records")
verbose_logger.debug(
"Galileo Logger error=%s, status code=%s",
response.text,
response.status_code,
)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug(f"On Async Failure")


@ -56,6 +56,7 @@ from ..integrations.clickhouse import ClickhouseLogger
from ..integrations.custom_logger import CustomLogger
from ..integrations.datadog import DataDogLogger
from ..integrations.dynamodb import DyanmoDBLogger
from ..integrations.galileo import GalileoObserve
from ..integrations.greenscale import GreenscaleLogger
from ..integrations.helicone import HeliconeLogger
from ..integrations.lago import LagoLogger
@ -153,11 +154,6 @@ class Logging:
langfuse_secret=None,
langfuse_host=None,
):
if call_type not in [item.value for item in CallTypes]:
allowed_values = ", ".join([item.value for item in CallTypes])
raise ValueError(
f"Invalid call_type {call_type}. Allowed values: {allowed_values}"
)
if messages is not None:
if isinstance(messages, str):
messages = [
@ -604,8 +600,7 @@ class Logging:
verbose_logger.error(
"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while building complete streaming response in success logging {}\n{}".format(
str(e), traceback.format_exc()
),
log_level="ERROR",
)
)
complete_streaming_response = None
else:
@ -1612,6 +1607,7 @@ class Logging:
)
== False
): # custom logger class
callback.log_failure_event(
start_time=start_time,
end_time=end_time,
@ -1929,6 +1925,15 @@ def _init_custom_logger_compatible_class(
_openmeter_logger = OpenMeterLogger()
_in_memory_loggers.append(_openmeter_logger)
return _openmeter_logger # type: ignore
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
if isinstance(callback, GalileoObserve):
return callback # type: ignore
galileo_logger = GalileoObserve()
_in_memory_loggers.append(galileo_logger)
return galileo_logger # type: ignore
elif logging_integration == "logfire":
if "LOGFIRE_TOKEN" not in os.environ:
raise ValueError("LOGFIRE_TOKEN not found in environment variables")
@ -1985,6 +1990,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, OpenMeterLogger):
return callback
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
if isinstance(callback, GalileoObserve):
return callback
elif logging_integration == "logfire":
if "LOGFIRE_TOKEN" not in os.environ:
raise ValueError("LOGFIRE_TOKEN not found in environment variables")


@ -49,7 +49,7 @@ class AnthropicConstants(Enum):
class AnthropicError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.message: str = message
self.request = httpx.Request(
method="POST", url="https://api.anthropic.com/v1/messages"
)
@ -830,6 +830,16 @@ class ModelResponseIterator:
.get("usage", {})
.get("output_tokens", 0),
)
elif type_chunk == "error":
"""
{"type":"error","error":{"details":null,"type":"api_error","message":"Internal server error"} }
"""
_error_dict = chunk.get("error", {}) or {}
message = _error_dict.get("message", None) or str(chunk)
raise AnthropicError(
message=message,
status_code=500, # it looks like Anthropic API does not return a status code in the chunk error - default to 500
)
returned_chunk = GenericStreamingChunk(
text=text,
tool_use=tool_use,
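From the caller's side, the practical effect of the new `error` branch is that a failed stream now raises instead of silently stopping. A minimal, hedged sketch of handling that (assumes `ANTHROPIC_API_KEY` is set; litellm's exception mapping may rewrap the `AnthropicError` into one of its own exception types):

```python
# Sketch: consume an Anthropic stream via litellm and surface mid-stream errors.
# Assumes ANTHROPIC_API_KEY is exported; the exact exception type seen here may be
# litellm's mapped exception rather than AnthropicError itself.
import litellm

try:
    stream = litellm.completion(
        model="claude-3-haiku-20240307",
        messages=[{"role": "user", "content": "hi"}],
        stream=True,
    )
    for chunk in stream:
        print(chunk.choices[0].delta.content or "", end="")
except Exception as e:
    # Error chunks like {"type": "error", ...} now raise (status_code defaults to 500)
    print(f"\nstream failed: {e}")
```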


@ -58,7 +58,33 @@ class NvidiaNimConfig:
and v is not None
}
def get_supported_openai_params(self):
def get_supported_openai_params(self, model: str) -> list:
"""
Get the supported OpenAI params for the given model
Updated on July 5th, 2024 - based on https://docs.api.nvidia.com/nim/reference
"""
if model in [
"google/recurrentgemma-2b",
"google/gemma-2-27b-it",
"google/gemma-2-9b-it",
"gemma-2-9b-it",
]:
return ["stream", "temperature", "top_p", "max_tokens", "stop", "seed"]
elif model == "nvidia/nemotron-4-340b-instruct":
return [
"stream",
"temperature",
"top_p",
"max_tokens",
]
elif model == "nvidia/nemotron-4-340b-reward":
return [
"stream",
]
elif model in ["google/codegemma-1.1-7b"]:
# most params - but no 'seed' :(
return [
"stream",
"temperature",
@ -68,11 +94,44 @@ class NvidiaNimConfig:
"max_tokens",
"stop",
]
else:
# DEFAULT Case - The vast majority of Nvidia NIM Models lie here
# "upstage/solar-10.7b-instruct",
# "snowflake/arctic",
# "seallms/seallm-7b-v2.5",
# "nvidia/llama3-chatqa-1.5-8b",
# "nvidia/llama3-chatqa-1.5-70b",
# "mistralai/mistral-large",
# "mistralai/mixtral-8x22b-instruct-v0.1",
# "mistralai/mixtral-8x7b-instruct-v0.1",
# "mistralai/mistral-7b-instruct-v0.3",
# "mistralai/mistral-7b-instruct-v0.2",
# "mistralai/codestral-22b-instruct-v0.1",
# "microsoft/phi-3-small-8k-instruct",
# "microsoft/phi-3-small-128k-instruct",
# "microsoft/phi-3-mini-4k-instruct",
# "microsoft/phi-3-mini-128k-instruct",
# "microsoft/phi-3-medium-4k-instruct",
# "microsoft/phi-3-medium-128k-instruct",
# "meta/llama3-70b-instruct",
# "meta/llama3-8b-instruct",
# "meta/llama2-70b",
# "meta/codellama-70b",
return [
"stream",
"temperature",
"top_p",
"frequency_penalty",
"presence_penalty",
"max_tokens",
"stop",
"seed",
]
def map_openai_params(
self, non_default_params: dict, optional_params: dict
self, model: str, non_default_params: dict, optional_params: dict
) -> dict:
supported_openai_params = self.get_supported_openai_params()
supported_openai_params = self.get_supported_openai_params(model=model)
for param, value in non_default_params.items():
if param in supported_openai_params:
optional_params[param] = value
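A quick way to see the per-model behaviour added above is to call the config directly. A minimal sketch using models from the branches above:

```python
import litellm

# Models in the default branch support the full parameter set, including "seed"
print(litellm.NvidiaNimConfig().get_supported_openai_params(model="meta/llama3-70b-instruct"))
# e.g. ['stream', 'temperature', 'top_p', 'frequency_penalty', 'presence_penalty', 'max_tokens', 'stop', 'seed']

# The reward model only supports streaming
print(litellm.NvidiaNimConfig().get_supported_openai_params(model="nvidia/nemotron-4-340b-reward"))
# ['stream']
```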


@ -2022,10 +2022,10 @@
"max_tokens": 8192,
"max_input_tokens": 2097152,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"input_cost_per_token": 0.0000035,
"input_cost_per_token_above_128k_tokens": 0.000007,
"output_cost_per_token": 0.0000105,
"output_cost_per_token_above_128k_tokens": 0.000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
@ -2033,16 +2033,16 @@
"supports_vision": true,
"supports_tool_choice": true,
"supports_response_schema": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-1.5-pro-latest": {
"max_tokens": 8192,
"max_input_tokens": 1048576,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"input_cost_per_token": 0.0000035,
"input_cost_per_token_above_128k_tokens": 0.000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"output_cost_per_token_above_128k_tokens": 0.000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
@ -2050,7 +2050,7 @@
"supports_vision": true,
"supports_tool_choice": true,
"supports_response_schema": true,
"source": "https://ai.google.dev/models/gemini"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-pro-vision": {
"max_tokens": 2048,


@ -1,5 +1,6 @@
from litellm._logging import verbose_proxy_logger
from litellm.proxy.guardrails.init_guardrails import guardrail_name_config_map
from litellm.proxy.proxy_server import UserAPIKeyAuth
from litellm.types.guardrails import *
@ -47,3 +48,44 @@ async def should_proceed_based_on_metadata(data: dict, guardrail_name: str) -> b
return False
return True
async def should_proceed_based_on_api_key(
user_api_key_dict: UserAPIKeyAuth, guardrail_name: str
) -> bool:
"""
checks if this guardrail should be applied to this call
"""
if user_api_key_dict.permissions is not None:
# { prompt_injection: true, rail_2: false }
verbose_proxy_logger.debug(
"Guardrails valid for API Key= %s - checking which to apply",
user_api_key_dict.permissions,
)
if not isinstance(user_api_key_dict.permissions, dict):
verbose_proxy_logger.error(
"API Key permissions must be a dict - %s running guardrail %s",
user_api_key_dict,
guardrail_name,
)
return True
for _guardrail_name, should_run in user_api_key_dict.permissions.items():
if should_run is False:
verbose_proxy_logger.debug(
"Guardrail %s skipped because request set to False",
_guardrail_name,
)
continue
# lookup the guardrail in guardrail_name_config_map
guardrail_item: GuardrailItem = guardrail_name_config_map[_guardrail_name]
guardrail_callbacks = guardrail_item.callbacks
if guardrail_name in guardrail_callbacks:
return True
# Do not proceed if - "metadata": { "guardrails": { "lakera_prompt_injection": false } }
return False
return True
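The resolution logic above can be hard to follow from the diff alone; here is a condensed, standalone sketch of the same decision, using hypothetical names and plain dicts in place of the real `UserAPIKeyAuth` / `GuardrailItem` types:

```python
# Condensed sketch of the per-key guardrail decision above.
# `permissions` mimics user_api_key_dict.permissions, e.g. {"pii_masking": True};
# `guardrail_config` mimics guardrail_name_config_map, mapping guardrail names
# to their callback lists.
from typing import Optional

def should_run_callback(
    callback_name: str,
    permissions: Optional[dict],
    guardrail_config: dict,
) -> bool:
    if permissions is None:
        # no per-key overrides -> fall back to the guardrail's default_on setting
        return True
    for guardrail_name, should_run in permissions.items():
        if should_run is False:
            continue  # explicitly switched off for this key
        if callback_name in guardrail_config.get(guardrail_name, []):
            return True  # this key switched on a guardrail that uses this callback
    return False

# Example: the key from the guardrails doc with {"pii_masking": true}
config = {"pii_masking": ["presidio"], "prompt_injection": ["lakera_prompt_injection"]}
print(should_run_callback("presidio", {"pii_masking": True}, config))                 # True
print(should_run_callback("lakera_prompt_injection", {"pii_masking": True}, config))  # False
```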


@ -7,6 +7,7 @@ import os
import re
import smtplib
import subprocess
import threading
import time
import traceback
from datetime import datetime, timedelta
@ -49,6 +50,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter
from litellm.proxy.hooks.parallel_request_limiter import (
_PROXY_MaxParallelRequestsHandler,
)
from litellm.types.utils import CallTypes
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
@ -354,35 +356,6 @@ class ProxyLogging:
print_verbose(f"final data being sent to {call_type} call: {data}")
return data
except Exception as e:
if "litellm_logging_obj" in data:
logging_obj: litellm.litellm_core_utils.litellm_logging.Logging = data[
"litellm_logging_obj"
]
## ASYNC FAILURE HANDLER ##
error_message = ""
if isinstance(e, HTTPException):
if isinstance(e.detail, str):
error_message = e.detail
elif isinstance(e.detail, dict):
error_message = json.dumps(e.detail)
else:
error_message = str(e)
else:
error_message = str(e)
error_raised = Exception(f"{error_message}")
await logging_obj.async_failure_handler(
exception=error_raised,
traceback_exception=traceback.format_exc(),
)
## SYNC FAILURE HANDLER ##
try:
logging_obj.failure_handler(
error_raised, traceback.format_exc()
) # DO NOT MAKE THREADED - router retry fallback relies on this!
except Exception as error_val:
pass
raise e
async def during_call_hook(
@ -597,18 +570,22 @@ class ProxyLogging:
)
### LOGGING ###
if isinstance(original_exception, HTTPException):
litellm_logging_obj: Optional[Logging] = request_data.get(
"litellm_logging_obj", None
)
if isinstance(original_exception, HTTPException):
if litellm_logging_obj is None:
import uuid
request_data["litellm_call_id"] = str(uuid.uuid4())
litellm_logging_obj, data = litellm.utils.function_setup(
original_function="IGNORE_THIS",
rules_obj=litellm.utils.Rules(),
start_time=datetime.now(),
**request_data,
)
if litellm_logging_obj is not None:
# log the custom exception
await litellm_logging_obj.async_failure_handler(
exception=original_exception,
@ -617,6 +594,16 @@ class ProxyLogging:
end_time=time.time(),
)
threading.Thread(
target=litellm_logging_obj.failure_handler,
args=(
original_exception,
traceback.format_exc(),
time.time(),
time.time(),
),
).start()
for callback in litellm.callbacks:
try:
_callback: Optional[CustomLogger] = None


@ -1607,7 +1607,17 @@ def test_caching_redis_simple(caplog):
print(m)
print(time.time() - s2)
redis_async_caching_error = False
redis_service_logging_error = False
captured_logs = [rec.message for rec in caplog.records]
assert "LiteLLM Redis Caching: async set" not in captured_logs
assert "ServiceLogging.async_service_success_hook" not in captured_logs
print(f"captured_logs: {captured_logs}")
for item in captured_logs:
if "Error connecting to Async Redis client" in item:
redis_async_caching_error = True
if "ServiceLogging.async_service_success_hook" in item:
redis_service_logging_error = True
assert redis_async_caching_error is False
assert redis_service_logging_error is False


@ -3602,6 +3602,8 @@ def test_completion_nvidia_nim():
"content": "What's the weather like in Boston today in Fahrenheit?",
}
],
presence_penalty=0.5,
frequency_penalty=0.1,
)
# Add any assertions here to check the response
print(response)


@ -23,6 +23,8 @@ import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
from typing import Literal
import pytest
from fastapi import Request, Response
from starlette.datastructures import URL
@ -49,18 +51,32 @@ from litellm.router import Router
class testLogger(CustomLogger):
def __init__(self):
self.reaches_failure_event = False
self.reaches_sync_failure_event = False
self.reaches_async_failure_event = False
async def async_pre_call_check(self, deployment: dict):
async def async_pre_call_hook(
self,
user_api_key_dict: UserAPIKeyAuth,
cache: DualCache,
data: dict,
call_type: Literal[
"completion",
"text_completion",
"embeddings",
"image_generation",
"moderation",
"audio_transcription",
],
):
raise HTTPException(
status_code=429, detail={"error": "Max parallel request limit reached"}
)
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
self.reaches_failure_event = True
return await super().async_log_failure_event(
kwargs, response_obj, start_time, end_time
)
self.reaches_async_failure_event = True
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
self.reaches_sync_failure_event = True
router = Router(
@ -92,15 +108,15 @@ router = Router(
],
},
),
# ("/v1/completions", {"model": "fake-model", "prompt": "ping"}),
# (
# "/v1/embeddings",
# {
# "input": "The food was delicious and the waiter...",
# "model": "text-embedding-ada-002",
# "encoding_format": "float",
# },
# ),
("/v1/completions", {"model": "fake-model", "prompt": "ping"}),
(
"/v1/embeddings",
{
"input": "The food was delicious and the waiter...",
"model": "text-embedding-ada-002",
"encoding_format": "float",
},
),
],
)
@pytest.mark.asyncio
@ -169,4 +185,6 @@ async def test_chat_completion_request_with_redaction(route, body):
pass
await asyncio.sleep(3)
assert _test_logger.reaches_failure_event is True
assert _test_logger.reaches_async_failure_event is True
assert _test_logger.reaches_sync_failure_event is True


@ -531,6 +531,8 @@ def function_setup(
call_type == CallTypes.aspeech.value or call_type == CallTypes.speech.value
):
messages = kwargs.get("input", "speech")
else:
messages = "default-message-value"
stream = True if "stream" in kwargs and kwargs["stream"] == True else False
logging_obj = litellm.litellm_core_utils.litellm_logging.Logging(
model=model,
@ -561,10 +563,8 @@ def function_setup(
)
return logging_obj, kwargs
except Exception as e:
import logging
logging.debug(
f"[Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}"
verbose_logger.error(
f"litellm.utils.py::function_setup() - [Non-Blocking] {traceback.format_exc()}; args - {args}; kwargs - {kwargs}"
)
raise e
@ -3184,7 +3184,9 @@ def get_optional_params(
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.NvidiaNimConfig().map_openai_params(
non_default_params=non_default_params, optional_params=optional_params
model=model,
non_default_params=non_default_params,
optional_params=optional_params,
)
elif custom_llm_provider == "fireworks_ai":
supported_params = get_supported_openai_params(
@ -3776,7 +3778,7 @@ def get_supported_openai_params(
elif custom_llm_provider == "fireworks_ai":
return litellm.FireworksAIConfig().get_supported_openai_params()
elif custom_llm_provider == "nvidia_nim":
return litellm.NvidiaNimConfig().get_supported_openai_params()
return litellm.NvidiaNimConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "volcengine":
return litellm.VolcEngineConfig().get_supported_openai_params(model=model)
elif custom_llm_provider == "groq":


@ -2022,10 +2022,10 @@
"max_tokens": 8192,
"max_input_tokens": 2097152,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"input_cost_per_token": 0.0000035,
"input_cost_per_token_above_128k_tokens": 0.000007,
"output_cost_per_token": 0.0000105,
"output_cost_per_token_above_128k_tokens": 0.000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
@ -2033,16 +2033,16 @@
"supports_vision": true,
"supports_tool_choice": true,
"supports_response_schema": true,
"source": "https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models#foundation_models"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-1.5-pro-latest": {
"max_tokens": 8192,
"max_input_tokens": 1048576,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000035,
"input_cost_per_token_above_128k_tokens": 0.0000007,
"input_cost_per_token": 0.0000035,
"input_cost_per_token_above_128k_tokens": 0.000007,
"output_cost_per_token": 0.00000105,
"output_cost_per_token_above_128k_tokens": 0.0000021,
"output_cost_per_token_above_128k_tokens": 0.000021,
"litellm_provider": "gemini",
"mode": "chat",
"supports_system_messages": true,
@ -2050,7 +2050,7 @@
"supports_vision": true,
"supports_tool_choice": true,
"supports_response_schema": true,
"source": "https://ai.google.dev/models/gemini"
"source": "https://ai.google.dev/pricing"
},
"gemini/gemini-pro-vision": {
"max_tokens": 2048,


@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.7"
version = "1.41.8"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -90,7 +90,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.41.7"
version = "1.41.8"
version_files = [
"pyproject.toml:^version"
]


@ -41,7 +41,7 @@ importlib-metadata==6.8.0 # for random utils
tokenizers==0.14.0 # for calculating usage
click==8.1.7 # for proxy cli
jinja2==3.1.4 # for prompt templates
certifi==2023.7.22 # [TODO] clean up
certifi==2024.7.4 # [TODO] clean up
aiohttp==3.9.0 # for network calls
aioboto3==12.3.0 # for async sagemaker calls
tenacity==8.2.3 # for retrying requests, when litellm.num_retries set