From d69552718dcd5f0a7015e851ec5226d389c0f993 Mon Sep 17 00:00:00 2001
From: Ishaan Jaff
Date: Thu, 21 Nov 2024 08:34:04 -0800
Subject: [PATCH] fix latency issues on google ai studio

---
 .../vertex_ai_context_caching.py | 29 ++++++++++---------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py b/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py
index e60a17052..b9be8a3bd 100644
--- a/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py
+++ b/litellm/llms/vertex_ai_and_google_ai_studio/context_caching/vertex_ai_context_caching.py
@@ -6,7 +6,11 @@ import httpx
 import litellm
 from litellm.caching.caching import Cache, LiteLLMCacheType
 from litellm.litellm_core_utils.litellm_logging import Logging
-from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+from litellm.llms.custom_httpx.http_handler import (
+    AsyncHTTPHandler,
+    HTTPHandler,
+    get_async_httpx_client,
+)
 from litellm.llms.OpenAI.openai import AllMessageValues
 from litellm.types.llms.vertex_ai import (
     CachedContentListAllResponseBody,
@@ -331,6 +335,13 @@ class ContextCachingEndpoints(VertexBase):
         if cached_content is not None:
             return messages, cached_content
 
+        cached_messages, non_cached_messages = separate_cached_messages(
+            messages=messages
+        )
+
+        if len(cached_messages) == 0:
+            return messages, None
+
         ## AUTHORIZATION ##
         token, url = self._get_token_and_url_context_caching(
             gemini_api_key=api_key,
@@ -347,22 +358,12 @@ class ContextCachingEndpoints(VertexBase):
             headers.update(extra_headers)
 
         if client is None or not isinstance(client, AsyncHTTPHandler):
-            _params = {}
-            if timeout is not None:
-                if isinstance(timeout, float) or isinstance(timeout, int):
-                    timeout = httpx.Timeout(timeout)
-                _params["timeout"] = timeout
-            client = AsyncHTTPHandler(**_params)  # type: ignore
+            client = get_async_httpx_client(
+                params={"timeout": timeout}, llm_provider=litellm.LlmProviders.VERTEX_AI
+            )
         else:
             client = client
 
-        cached_messages, non_cached_messages = separate_cached_messages(
-            messages=messages
-        )
-
-        if len(cached_messages) == 0:
-            return messages, None
-
         ## CHECK IF CACHED ALREADY
         generated_cache_key = local_cache_obj.get_cache_key(messages=cached_messages)
         google_cache_name = await self.async_check_cache(
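
The latency fix has two parts: the empty-cached-messages early return now runs before token/URL resolution and client setup, and the request path obtains its async client via get_async_httpx_client instead of constructing a fresh AsyncHTTPHandler (and a new connection pool) on every call. Below is a minimal sketch of that client-reuse idea under stated assumptions; the cache dict, helper name, and wrapper class here are illustrative, not litellm's actual internals.

    # Illustrative sketch of reusing one async HTTP client per provider.
    # Names (_client_cache, get_cached_async_client, AsyncHTTPHandler stand-in)
    # are assumptions for this example, not litellm's real implementation.
    from typing import Dict, Optional

    import httpx


    class AsyncHTTPHandler:
        """Thin stand-in wrapper around httpx.AsyncClient."""

        def __init__(self, timeout: Optional[httpx.Timeout] = None):
            self.client = httpx.AsyncClient(timeout=timeout)


    # Module-level cache: one handler per provider, so TCP/TLS connections are
    # reused across requests instead of being re-established on every call.
    _client_cache: Dict[str, AsyncHTTPHandler] = {}


    def get_cached_async_client(
        llm_provider: str, timeout: Optional[float] = None
    ) -> AsyncHTTPHandler:
        if llm_provider not in _client_cache:
            _timeout = httpx.Timeout(timeout) if timeout is not None else None
            _client_cache[llm_provider] = AsyncHTTPHandler(timeout=_timeout)
        return _client_cache[llm_provider]

With this pattern, repeated context-caching calls for the same provider hit a warm connection pool, which is where the per-request latency was going when a new handler was built each time.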