fix latency issues on google ai studio (#6852)

2024-11-21 19:02:08 -08:00 · 2024-11-21 19:02:08 -08:00 · 71ebf47cef
commit 71ebf47cef
parent 2903fd4164
1 changed files with 15 additions and 14 deletions
--- a/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py
+++ b/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py
@ -6,7 +6,11 @@ import httpx
 import litellm
 from litellm.caching.caching import Cache, LiteLLMCacheType
 from litellm.litellm_core_utils.litellm_logging import Logging
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+from litellm.llms.custom_httpx.http_handler import (
+    AsyncHTTPHandler,
+    HTTPHandler,
+    get_async_httpx_client,
+)
 from litellm.llms.OpenAI.openai import AllMessageValues
 from litellm.types.llms.vertex_ai import (
    CachedContentListAllResponseBody,
@ -331,6 +335,13 @@ class ContextCachingEndpoints(VertexBase):
        if cached_content is not None:
            return messages, cached_content

+        cached_messages, non_cached_messages = separate_cached_messages(
+            messages=messages
+        )
+
+        if len(cached_messages) == 0:
+            return messages, None
+
        ## AUTHORIZATION ##
        token, url = self._get_token_and_url_context_caching(
            gemini_api_key=api_key,
@ -347,22 +358,12 @@ class ContextCachingEndpoints(VertexBase):
            headers.update(extra_headers)

        if client is None or not isinstance(client, AsyncHTTPHandler):
-            _params = {}
-            if timeout is not None:
-                if isinstance(timeout, float) or isinstance(timeout, int):
-                    timeout = httpx.Timeout(timeout)
-                _params["timeout"] = timeout
-            client = AsyncHTTPHandler(**_params)  # type: ignore
+            client = get_async_httpx_client(
+                params={"timeout": timeout}, llm_provider=litellm.LlmProviders.VERTEX_AI
+            )
        else:
            client = client

-        cached_messages, non_cached_messages = separate_cached_messages(
-            messages=messages
-        )
-
-        if len(cached_messages) == 0:
-            return messages, None
-
        ## CHECK IF CACHED ALREADY
        generated_cache_key = local_cache_obj.get_cache_key(messages=cached_messages)
        google_cache_name = await self.async_check_cache(