Litellm Minor Fixes & Improvements (10/12/2024) (#6179)

* build(model_prices_and_context_window.json): add bedrock llama3.2 pricing * build(model_prices_and_context_window.json): add bedrock cross region inference pricing * Revert "(perf) move s3 logging to Batch logging + async [94% faster perf under 100 RPS on 1 litellm instance] (#6165)" This reverts commit 2a5624af47. * add azure/gpt-4o-2024-05-13 (#6174) * LiteLLM Minor Fixes & Improvements (10/10/2024) (#6158) * refactor(vertex_ai_partner_models/anthropic): refactor anthropic to use partner model logic * fix(vertex_ai/): support passing custom api base to partner models Fixes https://github.com/BerriAI/litellm/issues/4317 * fix(proxy_server.py): Fix prometheus premium user check logic * docs(prometheus.md): update quick start docs * fix(custom_llm.py): support passing dynamic api key + api base * fix(realtime_api/main.py): Add request/response logging for realtime api endpoints Closes https://github.com/BerriAI/litellm/issues/6081 * feat(openai/realtime): add openai realtime api logging Closes https://github.com/BerriAI/litellm/issues/6081 * fix(realtime_streaming.py): fix linting errors * fix(realtime_streaming.py): fix linting errors * fix: fix linting errors * fix pattern match router * Add literalai in the sidebar observability category (#6163) * fix: add literalai in the sidebar * fix: typo * update (#6160) * Feat: Add Langtrace integration (#5341) * Feat: Add Langtrace integration * add langtrace service name * fix timestamps for traces * add tests * Discard Callback + use existing otel logger * cleanup * remove print statements * remove callback * add docs * docs * add logging docs * format logging * remove emoji and add litellm proxy example * format logging * format `logging.md` * add langtrace docs to logging.md * sync conflict * docs fix * (perf) move s3 logging to Batch logging + async [94% faster perf under 100 RPS on 1 litellm instance] (#6165) * fix move s3 to use customLogger * add basic s3 logging test * add s3 to custom logger compatible * use 
batch logger for s3 * s3 set flush interval and batch size * fix s3 logging * add notes on s3 logging * fix s3 logging * add basic s3 logging test * fix s3 type errors * add test for sync logging on s3 * fix: fix to debug log --------- Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com> Co-authored-by: Willy Douhard <willy.douhard@gmail.com> Co-authored-by: yujonglee <yujonglee.dev@gmail.com> Co-authored-by: Ali Waleed <ali@scale3labs.com> * docs(custom_llm_server.md): update doc on passing custom params * fix(pass_through_endpoints.py): don't require headers Fixes https://github.com/BerriAI/litellm/issues/6128 * feat(utils.py): add support for caching rerank endpoints Closes https://github.com/BerriAI/litellm/issues/6144 * feat(litellm_logging.py): add response headers for failed requests Closes https://github.com/BerriAI/litellm/issues/6159 --------- Co-authored-by: Ishaan Jaff <ishaanjaffer0324@gmail.com> Co-authored-by: Willy Douhard <willy.douhard@gmail.com> Co-authored-by: yujonglee <yujonglee.dev@gmail.com> Co-authored-by: Ali Waleed <ali@scale3labs.com>
2025-04-25 18:54:30 +00:00 · 2024-10-12 11:48:34 -07:00 · 2024-10-12 11:48:34 -07:00 · 2acb0c0675
commit 2acb0c0675
parent 2cb65b450d
18 changed files with 533 additions and 82 deletions
--- a/litellm/caching.py
+++ b/litellm/caching.py
@ -20,13 +20,13 @@ from datetime import timedelta
 from enum import Enum
 from typing import Any, List, Literal, Optional, Tuple, Union

-from openai._models import BaseModel as OpenAIObject
+from pydantic import BaseModel

 import litellm
 from litellm._logging import verbose_logger
 from litellm.litellm_core_utils.core_helpers import _get_parent_otel_span_from_kwargs
 from litellm.types.services import ServiceLoggerPayload, ServiceTypes
-from litellm.types.utils import all_litellm_params
+from litellm.types.utils import CachingSupportedCallTypes, all_litellm_params


 def print_verbose(print_statement):
@ -2139,20 +2139,7 @@ class Cache:
        default_in_memory_ttl: Optional[float] = None,
        default_in_redis_ttl: Optional[float] = None,
        similarity_threshold: Optional[float] = None,
-        supported_call_types: Optional[
-            List[
-                Literal[
-                    "completion",
-                    "acompletion",
-                    "embedding",
-                    "aembedding",
-                    "atranscription",
-                    "transcription",
-                    "atext_completion",
-                    "text_completion",
-                ]
-            ]
-        ] = [
+        supported_call_types: Optional[List[CachingSupportedCallTypes]] = [
            "completion",
            "acompletion",
            "embedding",
@ -2161,6 +2148,8 @@ class Cache:
            "transcription",
            "atext_completion",
            "text_completion",
+            "arerank",
+            "rerank",
        ],
        # s3 Bucket, boto3 configuration
        s3_bucket_name: Optional[str] = None,
@ -2353,9 +2342,20 @@ class Cache:
            "file",
            "language",
        ]
+        rerank_only_kwargs = [
+            "top_n",
+            "rank_fields",
+            "return_documents",
+            "max_chunks_per_doc",
+            "documents",
+            "query",
+        ]
        # combined_kwargs - NEEDS to be ordered across get_cache_key(). Do not use a set()
        combined_kwargs = (
-            completion_kwargs + embedding_only_kwargs + transcription_only_kwargs
+            completion_kwargs
+            + embedding_only_kwargs
+            + transcription_only_kwargs
+            + rerank_only_kwargs
        )
        litellm_param_kwargs = all_litellm_params
        for param in kwargs:
@ -2557,7 +2557,7 @@ class Cache:
            else:
                cache_key = self.get_cache_key(*args, **kwargs)
            if cache_key is not None:
-                if isinstance(result, OpenAIObject):
+                if isinstance(result, BaseModel):
                    result = result.model_dump_json()

                ## DEFAULT TTL ##
@ -2778,20 +2778,7 @@ def enable_cache(
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
-    supported_call_types: Optional[
-        List[
-            Literal[
-                "completion",
-                "acompletion",
-                "embedding",
-                "aembedding",
-                "atranscription",
-                "transcription",
-                "atext_completion",
-                "text_completion",
-            ]
-        ]
-    ] = [
+    supported_call_types: Optional[List[CachingSupportedCallTypes]] = [
        "completion",
        "acompletion",
        "embedding",
@ -2800,6 +2787,8 @@ def enable_cache(
        "transcription",
        "atext_completion",
        "text_completion",
+        "arerank",
+        "rerank",
    ],
    **kwargs,
 ):
@ -2847,20 +2836,7 @@ def update_cache(
    host: Optional[str] = None,
    port: Optional[str] = None,
    password: Optional[str] = None,
-    supported_call_types: Optional[
-        List[
-            Literal[
-                "completion",
-                "acompletion",
-                "embedding",
-                "aembedding",
-                "atranscription",
-                "transcription",
-                "atext_completion",
-                "text_completion",
-            ]
-        ]
-    ] = [
+    supported_call_types: Optional[List[CachingSupportedCallTypes]] = [
        "completion",
        "acompletion",
        "embedding",
@ -2869,6 +2845,8 @@ def update_cache(
        "transcription",
        "atext_completion",
        "text_completion",
+        "arerank",
+        "rerank",
    ],
    **kwargs,
 ):