(feat) use @google-cloud/vertexai js sdk with litellm (#6873)

* stash gemini JS test

* add vertex js sdk example

* handle vertex pass through separately

* test vertex JS sdk

* fix vertex_proxy_route

* use PassThroughStreamingHandler

* fix PassThroughStreamingHandler

* use common _create_vertex_response_logging_payload_for_generate_content

* test vertex js

* add working vertex jest tests

* move basic pass through test

* use good name for test

* test vertex

* test_chunk_processor_yields_raw_bytes

* unit tests for streaming

* test_convert_raw_bytes_to_str_lines

* run unit tests 1st

* simplify local

* docs add usage example for js

* use get_litellm_virtual_key

* add unit tests for vertex pass through
Authored by Ishaan Jaff on 2024-11-22 16:50:10 -08:00, committed by GitHub
parent 5930c42e74
commit b2b3e40d13
14 changed files with 680 additions and 89 deletions
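
In practice, this change lets the `@google-cloud/vertexai` JS SDK talk to Vertex AI through a LiteLLM proxy, with the proxy handling auth, cost tracking, and logging. Because the JS SDK reserves the `Authorization` header for Google credentials, the proxy reads the LiteLLM virtual key from a separate `x-litellm-api-key` header (see `get_litellm_virtual_key` in the last file below). A minimal sketch of the equivalent raw HTTP call; the base URL, route prefix, project, location, model, and key are all placeholder assumptions, not values from this commit:

# Sketch only: the raw HTTP equivalent of what the JS SDK sends via the proxy.
import httpx

LITELLM_PROXY = "http://localhost:4000"   # assumed local proxy
VIRTUAL_KEY = "sk-1234"                   # a LiteLLM virtual key, not a Google credential

resp = httpx.post(
    f"{LITELLM_PROXY}/vertex-ai/v1/projects/my-project/locations/us-central1"
    "/publishers/google/models/gemini-1.5-flash:generateContent",
    # The JS SDK occupies `Authorization` with Google auth, so the proxy
    # accepts the virtual key via `x-litellm-api-key` instead.
    headers={"x-litellm-api-key": VIRTUAL_KEY},
    json={"contents": [{"role": "user", "parts": [{"text": "hi"}]}]},
)
print(resp.json())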


@@ -100,7 +100,7 @@ class AnthropicPassthroughLoggingHandler:
         kwargs["response_cost"] = response_cost
         kwargs["model"] = model

-        # Make standard logging object for Vertex AI
+        # Make standard logging object for Anthropic
         standard_logging_object = get_standard_logging_object_payload(
             kwargs=kwargs,
             init_response_obj=litellm_model_response,


@@ -56,8 +56,14 @@ class VertexPassthroughLoggingHandler:
                 encoding=None,
             )
         )
-        logging_obj.model = litellm_model_response.model or model
-        logging_obj.model_call_details["model"] = logging_obj.model

+        kwargs = VertexPassthroughLoggingHandler._create_vertex_response_logging_payload_for_generate_content(
+            litellm_model_response=litellm_model_response,
+            model=model,
+            kwargs=kwargs,
+            start_time=start_time,
+            end_time=end_time,
+            logging_obj=logging_obj,
+        )
         await logging_obj.async_success_handler(
             result=litellm_model_response,
@@ -147,6 +153,14 @@ class VertexPassthroughLoggingHandler:
                 "Unable to build complete streaming response for Vertex passthrough endpoint, not logging..."
             )
             return
+        kwargs = VertexPassthroughLoggingHandler._create_vertex_response_logging_payload_for_generate_content(
+            litellm_model_response=complete_streaming_response,
+            model=model,
+            kwargs=kwargs,
+            start_time=start_time,
+            end_time=end_time,
+            logging_obj=litellm_logging_obj,
+        )
         await litellm_logging_obj.async_success_handler(
             result=complete_streaming_response,
             start_time=start_time,
@@ -193,3 +207,47 @@ class VertexPassthroughLoggingHandler:
         if match:
             return match.group(1)
         return "unknown"
+
+    @staticmethod
+    def _create_vertex_response_logging_payload_for_generate_content(
+        litellm_model_response: Union[
+            litellm.ModelResponse, litellm.TextCompletionResponse
+        ],
+        model: str,
+        kwargs: dict,
+        start_time: datetime,
+        end_time: datetime,
+        logging_obj: LiteLLMLoggingObj,
+    ):
+        """
+        Create the standard logging object for Vertex passthrough generateContent (streaming and non-streaming)
+        """
+
+        response_cost = litellm.completion_cost(
+            completion_response=litellm_model_response,
+            model=model,
+        )
+        kwargs["response_cost"] = response_cost
+        kwargs["model"] = model
+
+        # Make standard logging object for Vertex AI
+        standard_logging_object = get_standard_logging_object_payload(
+            kwargs=kwargs,
+            init_response_obj=litellm_model_response,
+            start_time=start_time,
+            end_time=end_time,
+            logging_obj=logging_obj,
+            status="success",
+        )
+
+        # pretty print standard logging object
+        verbose_proxy_logger.debug(
+            "standard_logging_object= %s", json.dumps(standard_logging_object, indent=4)
+        )
+        kwargs["standard_logging_object"] = standard_logging_object
+
+        # set litellm_call_id to logging response object
+        litellm_model_response.id = logging_obj.litellm_call_id
+        logging_obj.model = litellm_model_response.model or model
+        logging_obj.model_call_details["model"] = logging_obj.model
+        return kwargs
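
For reference, the cost portion of this shared helper reduces to `litellm.completion_cost` over the (possibly reassembled) response, and both the streaming and non-streaming Vertex paths above now funnel through it. A hedged standalone sketch; the model name and token counts here are invented, and constructing a `ModelResponse` by hand like this is an assumption for illustration:

import litellm
from litellm.types.utils import Usage

# Invented response purely to illustrate the cost computation the helper performs.
response = litellm.ModelResponse(
    model="gemini-1.5-flash",
    usage=Usage(prompt_tokens=10, completion_tokens=20, total_tokens=30),
)

# completion_cost() maps model + usage to a float USD cost;
# the helper stores it as kwargs["response_cost"].
cost = litellm.completion_cost(
    completion_response=response,
    model="vertex_ai/gemini-1.5-flash",
)
print(cost)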


@@ -36,7 +36,7 @@ from litellm.proxy._types import (
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 from litellm.secret_managers.main import get_secret_str

-from .streaming_handler import chunk_processor
+from .streaming_handler import PassThroughStreamingHandler
 from .success_handler import PassThroughEndpointLogging
 from .types import EndpointType, PassthroughStandardLoggingPayload
@@ -448,7 +448,7 @@ async def pass_through_request(  # noqa: PLR0915
         )

         return StreamingResponse(
-            chunk_processor(
+            PassThroughStreamingHandler.chunk_processor(
                 response=response,
                 request_body=_parsed_body,
                 litellm_logging_obj=logging_obj,
@@ -491,7 +491,7 @@
         )

         return StreamingResponse(
-            chunk_processor(
+            PassThroughStreamingHandler.chunk_processor(
                 response=response,
                 request_body=_parsed_body,
                 litellm_logging_obj=logging_obj,


@@ -27,93 +27,107 @@ from .success_handler import PassThroughEndpointLogging
 from .types import EndpointType


-async def chunk_processor(
-    response: httpx.Response,
-    request_body: Optional[dict],
-    litellm_logging_obj: LiteLLMLoggingObj,
-    endpoint_type: EndpointType,
-    start_time: datetime,
-    passthrough_success_handler_obj: PassThroughEndpointLogging,
-    url_route: str,
-):
-    """
-    - Yields chunks from the response
-    - Collect non-empty chunks for post-processing (logging)
-    """
-    collected_chunks: List[str] = []  # List to store all chunks
-    try:
-        async for chunk in response.aiter_lines():
-            verbose_proxy_logger.debug(f"Processing chunk: {chunk}")
-            if not chunk:
-                continue
-
-            # Handle SSE format - pass through the raw SSE format
-            if isinstance(chunk, bytes):
-                chunk = chunk.decode("utf-8")
-
-            # Store the chunk for post-processing
-            if chunk.strip():  # Only store non-empty chunks
-                collected_chunks.append(chunk)
-            yield f"{chunk}\n"
-
-        # After all chunks are processed, handle post-processing
-        end_time = datetime.now()
-
-        await _route_streaming_logging_to_handler(
-            litellm_logging_obj=litellm_logging_obj,
-            passthrough_success_handler_obj=passthrough_success_handler_obj,
-            url_route=url_route,
-            request_body=request_body or {},
-            endpoint_type=endpoint_type,
-            start_time=start_time,
-            all_chunks=collected_chunks,
-            end_time=end_time,
-        )
-    except Exception as e:
-        verbose_proxy_logger.error(f"Error in chunk_processor: {str(e)}")
-        raise
-
-
-async def _route_streaming_logging_to_handler(
-    litellm_logging_obj: LiteLLMLoggingObj,
-    passthrough_success_handler_obj: PassThroughEndpointLogging,
-    url_route: str,
-    request_body: dict,
-    endpoint_type: EndpointType,
-    start_time: datetime,
-    all_chunks: List[str],
-    end_time: datetime,
-):
-    """
-    Route the logging for the collected chunks to the appropriate handler
-
-    Supported endpoint types:
-    - Anthropic
-    - Vertex AI
-    """
-    if endpoint_type == EndpointType.ANTHROPIC:
-        await AnthropicPassthroughLoggingHandler._handle_logging_anthropic_collected_chunks(
-            litellm_logging_obj=litellm_logging_obj,
-            passthrough_success_handler_obj=passthrough_success_handler_obj,
-            url_route=url_route,
-            request_body=request_body,
-            endpoint_type=endpoint_type,
-            start_time=start_time,
-            all_chunks=all_chunks,
-            end_time=end_time,
-        )
-    elif endpoint_type == EndpointType.VERTEX_AI:
-        await VertexPassthroughLoggingHandler._handle_logging_vertex_collected_chunks(
-            litellm_logging_obj=litellm_logging_obj,
-            passthrough_success_handler_obj=passthrough_success_handler_obj,
-            url_route=url_route,
-            request_body=request_body,
-            endpoint_type=endpoint_type,
-            start_time=start_time,
-            all_chunks=all_chunks,
-            end_time=end_time,
-        )
-    elif endpoint_type == EndpointType.GENERIC:
-        # No logging is supported for generic streaming endpoints
-        pass
+class PassThroughStreamingHandler:
+    @staticmethod
+    async def chunk_processor(
+        response: httpx.Response,
+        request_body: Optional[dict],
+        litellm_logging_obj: LiteLLMLoggingObj,
+        endpoint_type: EndpointType,
+        start_time: datetime,
+        passthrough_success_handler_obj: PassThroughEndpointLogging,
+        url_route: str,
+    ):
+        """
+        - Yields chunks from the response
+        - Collect non-empty chunks for post-processing (logging)
+        """
+        try:
+            raw_bytes: List[bytes] = []
+            async for chunk in response.aiter_bytes():
+                raw_bytes.append(chunk)
+                yield chunk
+
+            # After all chunks are processed, handle post-processing
+            end_time = datetime.now()
+
+            await PassThroughStreamingHandler._route_streaming_logging_to_handler(
+                litellm_logging_obj=litellm_logging_obj,
+                passthrough_success_handler_obj=passthrough_success_handler_obj,
+                url_route=url_route,
+                request_body=request_body or {},
+                endpoint_type=endpoint_type,
+                start_time=start_time,
+                raw_bytes=raw_bytes,
+                end_time=end_time,
+            )
+        except Exception as e:
+            verbose_proxy_logger.error(f"Error in chunk_processor: {str(e)}")
+            raise
+
+    @staticmethod
+    async def _route_streaming_logging_to_handler(
+        litellm_logging_obj: LiteLLMLoggingObj,
+        passthrough_success_handler_obj: PassThroughEndpointLogging,
+        url_route: str,
+        request_body: dict,
+        endpoint_type: EndpointType,
+        start_time: datetime,
+        raw_bytes: List[bytes],
+        end_time: datetime,
+    ):
+        """
+        Route the logging for the collected chunks to the appropriate handler
+
+        Supported endpoint types:
+        - Anthropic
+        - Vertex AI
+        """
+        all_chunks = PassThroughStreamingHandler._convert_raw_bytes_to_str_lines(
+            raw_bytes
+        )
+        if endpoint_type == EndpointType.ANTHROPIC:
+            await AnthropicPassthroughLoggingHandler._handle_logging_anthropic_collected_chunks(
+                litellm_logging_obj=litellm_logging_obj,
+                passthrough_success_handler_obj=passthrough_success_handler_obj,
+                url_route=url_route,
+                request_body=request_body,
+                endpoint_type=endpoint_type,
+                start_time=start_time,
+                all_chunks=all_chunks,
+                end_time=end_time,
+            )
+        elif endpoint_type == EndpointType.VERTEX_AI:
+            await VertexPassthroughLoggingHandler._handle_logging_vertex_collected_chunks(
+                litellm_logging_obj=litellm_logging_obj,
+                passthrough_success_handler_obj=passthrough_success_handler_obj,
+                url_route=url_route,
+                request_body=request_body,
+                endpoint_type=endpoint_type,
+                start_time=start_time,
+                all_chunks=all_chunks,
+                end_time=end_time,
+            )
+        elif endpoint_type == EndpointType.GENERIC:
+            # No logging is supported for generic streaming endpoints
+            pass
+
+    @staticmethod
+    def _convert_raw_bytes_to_str_lines(raw_bytes: List[bytes]) -> List[str]:
+        """
+        Converts a list of raw bytes into a list of string lines, similar to aiter_lines()
+
+        Args:
+            raw_bytes: List of bytes chunks from aiter.bytes()
+
+        Returns:
+            List of string lines, with each line being a complete data: {} chunk
+        """
+        # Combine all bytes and decode to string
+        combined_str = b"".join(raw_bytes).decode("utf-8")
+
+        # Split by newlines and filter out empty lines
+        lines = [line.strip() for line in combined_str.split("\n") if line.strip()]
+        return lines
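
The substantive change in this file: the old `chunk_processor` iterated `response.aiter_lines()` and re-emitted each line with an appended newline, which could reshape the raw stream the client sees; the new handler yields `response.aiter_bytes()` chunks untouched and only re-splits the collected bytes into lines afterwards, for logging. A standalone sketch of that post-hoc splitting; the byte chunks are made up:

# Byte chunks as they might arrive from aiter_bytes(), split mid-line on purpose.
raw_bytes = [b'data: {"a"', b': 1}\n\ndata:', b' {"b": 2}\n']

# Same steps as _convert_raw_bytes_to_str_lines: join, decode, split, drop blanks.
combined_str = b"".join(raw_bytes).decode("utf-8")
lines = [line.strip() for line in combined_str.split("\n") if line.strip()]

assert lines == ['data: {"a": 1}', 'data: {"b": 2}']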


@@ -119,7 +119,6 @@ async def vertex_proxy_route(
     endpoint: str,
     request: Request,
     fastapi_response: Response,
-    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
 ):
     encoded_endpoint = httpx.URL(endpoint).path

@@ -127,6 +126,11 @@
     verbose_proxy_logger.debug("requested endpoint %s", endpoint)
     headers: dict = {}
+    api_key_to_use = get_litellm_virtual_key(request=request)
+    user_api_key_dict = await user_api_key_auth(
+        request=request,
+        api_key=api_key_to_use,
+    )

     vertex_project = None
     vertex_location = None

@@ -214,3 +218,18 @@
     )
     return received_value

+
+def get_litellm_virtual_key(request: Request) -> str:
+    """
+    Extract and format API key from request headers.
+    Prioritizes x-litellm-api-key over Authorization header.
+
+    Vertex JS SDK uses `Authorization` header, we use `x-litellm-api-key` to pass litellm virtual key
+    """
+    litellm_api_key = request.headers.get("x-litellm-api-key")
+
+    if litellm_api_key:
+        return f"Bearer {litellm_api_key}"
+
+    return request.headers.get("Authorization", "")
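
Since `get_litellm_virtual_key` is pure header logic, its precedence is easy to check in isolation. A sketch with a stand-in request object; `FakeRequest` is not part of litellm, and the function body is copied from the diff above:

class FakeRequest:
    # Stand-in for fastapi.Request: only .headers is needed here.
    def __init__(self, headers: dict):
        self.headers = headers


def get_litellm_virtual_key(request) -> str:
    # Same logic as the function added above.
    litellm_api_key = request.headers.get("x-litellm-api-key")
    if litellm_api_key:
        return f"Bearer {litellm_api_key}"
    return request.headers.get("Authorization", "")


# x-litellm-api-key wins, so the JS SDK can keep Authorization for Google auth:
assert get_litellm_virtual_key(
    FakeRequest({"x-litellm-api-key": "sk-virtual", "Authorization": "Bearer google-token"})
) == "Bearer sk-virtual"

# Without it, the Authorization header is used as-is:
assert get_litellm_virtual_key(
    FakeRequest({"Authorization": "Bearer sk-direct"})
) == "Bearer sk-direct"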