(Perf / latency improvement) improve pass-through endpoint latency to ~50ms (was ~400ms before this PR) (#6874)

* use correct location for types

* fix types location

* perf improvement for pass through endpoints

* update lint check

* fix import

* fix ensure async clients test

* fix azure.py health check

* fix ollama
Author: Ishaan Jaff, 2024-11-22 18:47:26 -08:00 (committed by GitHub)
Parent: 772b2f9cd2
Commit: d81ae45827
9 changed files with 64 additions and 19 deletions


@@ -22,6 +22,7 @@ import litellm
 from litellm._logging import verbose_proxy_logger
 from litellm.integrations.custom_logger import CustomLogger
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
+from litellm.llms.custom_httpx.http_handler import get_async_httpx_client
 from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
     ModelResponseIterator,
 )
@@ -35,6 +36,7 @@ from litellm.proxy._types import (
 )
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.secret_managers.main import get_secret_str
+from litellm.types.llms.custom_http import httpxSpecialProvider
 from .streaming_handler import PassThroughStreamingHandler
 from .success_handler import PassThroughEndpointLogging
@@ -363,8 +365,11 @@ async def pass_through_request(  # noqa: PLR0915
         data=_parsed_body,
         call_type="pass_through_endpoint",
     )
-    async_client = httpx.AsyncClient(timeout=600)
+    async_client_obj = get_async_httpx_client(
+        llm_provider=httpxSpecialProvider.PassThroughEndpoint,
+        params={"timeout": 600},
+    )
+    async_client = async_client_obj.client
     litellm_call_id = str(uuid.uuid4())
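
The core change is in the last hunk: instead of constructing a fresh `httpx.AsyncClient(timeout=600)` on every pass-through request, the handler now obtains a client from `get_async_httpx_client(llm_provider=httpxSpecialProvider.PassThroughEndpoint, ...)` and reuses its underlying `.client`. Reusing one client keeps its connection pool (and TLS sessions) alive across requests, which is the likely source of the ~400ms to ~50ms improvement cited in the title. Below is a minimal sketch of that pattern in plain `httpx`; `UPSTREAM_URL`, `forward_slow`, `forward_fast`, and `_get_shared_client` are illustrative names, not part of the litellm codebase.

```python
from typing import Optional

import httpx

# Hypothetical upstream URL, used only for illustration.
UPSTREAM_URL = "https://example.com/v1/generate"


# Before: a new AsyncClient per request pays TCP/TLS connection setup
# on every call, which dominates latency for small request bodies.
async def forward_slow(payload: dict) -> dict:
    async with httpx.AsyncClient(timeout=600) as client:
        response = await client.post(UPSTREAM_URL, json=payload)
        return response.json()


# After: build the client once and reuse it, so keep-alive connections
# in the pool are shared across requests.
_shared_client: Optional[httpx.AsyncClient] = None


def _get_shared_client() -> httpx.AsyncClient:
    global _shared_client
    if _shared_client is None:
        _shared_client = httpx.AsyncClient(timeout=600)
    return _shared_client


async def forward_fast(payload: dict) -> dict:
    client = _get_shared_client()
    response = await client.post(UPSTREAM_URL, json=payload)
    return response.json()
```

In litellm itself the caching is handled by `get_async_httpx_client` keyed on the provider (here `httpxSpecialProvider.PassThroughEndpoint`), so the pass-through route gets a dedicated, reused client rather than the module-level singleton sketched above.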