fix vertex use async func to set auth creds

Ishaan Jaff 2024-09-10 16:12:18 -07:00
parent 26ae86e59b
commit 1c6f8b1be2
8 changed files with 420 additions and 230 deletions

@@ -34,10 +34,18 @@ class GCSBucketBase(CustomLogger):
async def construct_request_headers(self) -> Dict[str, str]:
from litellm import vertex_chat_completion
_auth_header, vertex_project = (
await vertex_chat_completion._ensure_access_token_async(
credentials=self.path_service_account_json,
project_id=None,
)
)
auth_header, _ = vertex_chat_completion._get_token_and_url(
model="gcs-bucket",
auth_header=_auth_header,
vertex_credentials=self.path_service_account_json,
-vertex_project=None,
+vertex_project=vertex_project,
vertex_location=None,
gemini_api_key=None,
stream=None,
@@ -55,10 +63,16 @@ class GCSBucketBase(CustomLogger):
def sync_construct_request_headers(self) -> Dict[str, str]:
from litellm import vertex_chat_completion
_auth_header, vertex_project = vertex_chat_completion._ensure_access_token(
credentials=self.path_service_account_json,
project_id=None,
)
auth_header, _ = vertex_chat_completion._get_token_and_url(
model="gcs-bucket",
auth_header=_auth_header,
vertex_credentials=self.path_service_account_json,
-vertex_project=None,
+vertex_project=vertex_project,
vertex_location=None,
gemini_api_key=None,
stream=None,
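
The refactor in this file shows the new two-step pattern used throughout the commit: token resolution is awaited first (so google-auth's blocking refresh no longer runs on the event loop), and _get_token_and_url receives the pre-fetched token through its new auth_header parameter instead of fetching credentials itself. A minimal sketch of the async variant, assuming a service-account JSON path (build_gcs_headers and the returned header shape are illustrative, not the exact method body):

import asyncio
from typing import Dict

from litellm import vertex_chat_completion


async def build_gcs_headers(service_account_json: str) -> Dict[str, str]:
    # Step 1: resolve the bearer token without blocking the event loop.
    token, project_id = await vertex_chat_completion._ensure_access_token_async(
        credentials=service_account_json,
        project_id=None,
    )
    # Step 2: build the URL/header pair; the token is passed in, not re-fetched.
    auth_header, _url = vertex_chat_completion._get_token_and_url(
        model="gcs-bucket",
        auth_header=token,
        gemini_api_key=None,
        vertex_credentials=service_account_json,
        vertex_project=project_id,
        vertex_location=None,
        stream=None,
        custom_llm_provider="vertex_ai",
        api_base=None,
    )
    return {"Authorization": f"Bearer {auth_header}"}  # assumed header shape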

@@ -185,8 +185,14 @@ class VertexFineTuningAPI(VertexLLM):
"creating fine tuning job, args= %s", create_fine_tuning_job_data
)
_auth_header, vertex_project = self._ensure_access_token(
credentials=vertex_credentials,
project_id=vertex_project,
)
auth_header, _ = self._get_token_and_url(
model="",
auth_header=_auth_header,
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,
@@ -251,8 +257,14 @@ class VertexFineTuningAPI(VertexLLM):
vertex_credentials: str,
request_route: str,
):
_auth_header, vertex_project = await self._ensure_access_token_async(
credentials=vertex_credentials,
project_id=vertex_project,
)
auth_header, _ = self._get_token_and_url(
model="",
auth_header=_auth_header,
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,

@@ -72,17 +72,13 @@ from ..common_utils import (
all_gemini_url_modes,
get_supports_system_message,
)
from ..vertex_llm_base import VertexBase
from .transformation import (
async_transform_request_body,
set_headers,
sync_transform_request_body,
)
if TYPE_CHECKING:
from google.auth.credentials import Credentials as GoogleCredentialsObject
else:
GoogleCredentialsObject = Any
class VertexAIConfig:
"""
@@ -821,14 +817,9 @@ def make_sync_call(
return completion_stream
-class VertexLLM(BaseLLM):
+class VertexLLM(VertexBase):
def __init__(self) -> None:
super().__init__()
self.access_token: Optional[str] = None
self.refresh_token: Optional[str] = None
self._credentials: Optional[GoogleCredentialsObject] = None
self.project_id: Optional[str] = None
self.async_handler: Optional[AsyncHTTPHandler] = None
def _process_response(
self,
@@ -1057,201 +1048,13 @@ class VertexLLM(BaseLLM):
return model_response
def get_vertex_region(self, vertex_region: Optional[str]) -> str:
return vertex_region or "us-central1"
def load_auth(
self, credentials: Optional[str], project_id: Optional[str]
) -> Tuple[Any, str]:
import google.auth as google_auth
from google.auth import identity_pool
from google.auth.credentials import Credentials # type: ignore[import-untyped]
from google.auth.transport.requests import (
Request, # type: ignore[import-untyped]
)
if credentials is not None and isinstance(credentials, str):
import google.oauth2.service_account
verbose_logger.debug(
"Vertex: Loading vertex credentials from %s", credentials
)
verbose_logger.debug(
"Vertex: checking if credentials is a valid path, os.path.exists(%s)=%s, current dir %s",
credentials,
os.path.exists(credentials),
os.getcwd(),
)
try:
if os.path.exists(credentials):
json_obj = json.load(open(credentials))
else:
json_obj = json.loads(credentials)
except Exception:
raise Exception(
"Unable to load vertex credentials from environment. Got={}".format(
credentials
)
)
# Check if the JSON object contains Workload Identity Federation configuration
if "type" in json_obj and json_obj["type"] == "external_account":
creds = identity_pool.Credentials.from_info(json_obj)
else:
creds = (
google.oauth2.service_account.Credentials.from_service_account_info(
json_obj,
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
)
if project_id is None:
project_id = creds.project_id
else:
creds, creds_project_id = google_auth.default(
quota_project_id=project_id,
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
if project_id is None:
project_id = creds_project_id
creds.refresh(Request())
if not project_id:
raise ValueError("Could not resolve project_id")
if not isinstance(project_id, str):
raise TypeError(
f"Expected project_id to be a str but got {type(project_id)}"
)
return creds, project_id
def refresh_auth(self, credentials: Any) -> None:
from google.auth.transport.requests import (
Request, # type: ignore[import-untyped]
)
credentials.refresh(Request())
def _ensure_access_token(
self, credentials: Optional[str], project_id: Optional[str]
) -> Tuple[str, str]:
"""
Returns auth token and project id
"""
if self.access_token is not None:
if project_id is not None:
return self.access_token, project_id
elif self.project_id is not None:
return self.access_token, self.project_id
if not self._credentials:
self._credentials, cred_project_id = self.load_auth(
credentials=credentials, project_id=project_id
)
if not self.project_id:
self.project_id = project_id or cred_project_id
else:
if self._credentials.expired or not self._credentials.token:
self.refresh_auth(self._credentials)
if not self.project_id:
self.project_id = self._credentials.quota_project_id
if not self.project_id:
raise ValueError("Could not resolve project_id")
if not self._credentials or not self._credentials.token:
raise RuntimeError("Could not resolve API token from the environment")
return self._credentials.token, project_id or self.project_id
def is_using_v1beta1_features(self, optional_params: dict) -> bool:
"""
VertexAI only supports ContextCaching on v1beta1
use this helper to decide if request should be sent to v1 or v1beta1
Returns v1beta1 if context caching is enabled
Returns v1 in all other cases
"""
if "cached_content" in optional_params:
return True
if "CachedContent" in optional_params:
return True
return False
def _get_token_and_url(
self,
model: str,
gemini_api_key: Optional[str],
vertex_project: Optional[str],
vertex_location: Optional[str],
vertex_credentials: Optional[str],
stream: Optional[bool],
custom_llm_provider: Literal["vertex_ai", "vertex_ai_beta", "gemini"],
api_base: Optional[str],
should_use_v1beta1_features: Optional[bool] = False,
mode: all_gemini_url_modes = "chat",
) -> Tuple[Optional[str], str]:
"""
Internal function. Returns the token and url for the call.
Handles logic if it's google ai studio vs. vertex ai.
Returns
token, url
"""
if custom_llm_provider == "gemini":
auth_header = None
url, endpoint = _get_gemini_url(
mode=mode,
model=model,
stream=stream,
gemini_api_key=gemini_api_key,
)
else:
auth_header, vertex_project = self._ensure_access_token(
credentials=vertex_credentials, project_id=vertex_project
)
vertex_location = self.get_vertex_region(vertex_region=vertex_location)
### SET RUNTIME ENDPOINT ###
version: Literal["v1beta1", "v1"] = (
"v1beta1" if should_use_v1beta1_features is True else "v1"
)
url, endpoint = _get_vertex_url(
mode=mode,
model=model,
stream=stream,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_api_version=version,
)
if (
api_base is not None
): # for cloudflare ai gateway - https://github.com/BerriAI/litellm/issues/4317
if custom_llm_provider == "gemini":
url = "{}:{}".format(api_base, endpoint)
auth_header = (
gemini_api_key # cloudflare expects api key as bearer token
)
else:
url = "{}:{}".format(api_base, endpoint)
if stream is True:
url = url + "?alt=sse"
return auth_header, url
async def async_streaming(
self,
model: str,
custom_llm_provider: Literal[
"vertex_ai", "vertex_ai_beta", "gemini"
], # if it's vertex_ai or gemini (google ai studio)
messages: list,
api_base: str,
model_response: ModelResponse,
print_verbose: Callable,
data: dict,
@@ -1262,11 +1065,49 @@ class VertexLLM(BaseLLM):
optional_params: dict,
litellm_params=None,
logger_fn=None,
headers={},
api_base: Optional[str] = None,
client: Optional[AsyncHTTPHandler] = None,
vertex_project: Optional[str] = None,
vertex_location: Optional[str] = None,
vertex_credentials: Optional[str] = None,
extra_headers: Optional[dict] = None,
) -> CustomStreamWrapper:
request_body = await async_transform_request_body(**data) # type: ignore
should_use_v1beta1_features = self.is_using_v1beta1_features(
optional_params=optional_params
)
_auth_header, vertex_project = await self._ensure_access_token_async(
credentials=vertex_credentials, project_id=vertex_project
)
auth_header, api_base = self._get_token_and_url(
model=model,
gemini_api_key=None,
auth_header=_auth_header,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_credentials=vertex_credentials,
stream=stream,
custom_llm_provider=custom_llm_provider,
api_base=api_base,
should_use_v1beta1_features=should_use_v1beta1_features,
)
headers = set_headers(auth_header=auth_header, extra_headers=extra_headers)
## LOGGING
logging_obj.pre_call(
input=messages,
api_key="",
additional_args={
"complete_input_dict": data,
"api_base": api_base,
"headers": headers,
},
)
request_body_str = json.dumps(request_body)
streaming_response = CustomStreamWrapper(
completion_stream=None,
@@ -1290,21 +1131,50 @@ class VertexLLM(BaseLLM):
self,
model: str,
messages: list,
api_base: str,
model_response: ModelResponse,
print_verbose: Callable,
data: dict,
custom_llm_provider: Literal[
"vertex_ai", "vertex_ai_beta", "gemini"
], # if it's vertex_ai or gemini (google ai studio)
timeout: Optional[Union[float, httpx.Timeout]],
encoding,
logging_obj,
stream,
optional_params: dict,
litellm_params: dict,
headers: dict,
logger_fn=None,
api_base: Optional[str] = None,
client: Optional[AsyncHTTPHandler] = None,
vertex_project: Optional[str] = None,
vertex_location: Optional[str] = None,
vertex_credentials: Optional[str] = None,
extra_headers: Optional[dict] = None,
) -> Union[ModelResponse, CustomStreamWrapper]:
should_use_v1beta1_features = self.is_using_v1beta1_features(
optional_params=optional_params
)
_auth_header, vertex_project = await self._ensure_access_token_async(
credentials=vertex_credentials, project_id=vertex_project
)
auth_header, api_base = self._get_token_and_url(
model=model,
gemini_api_key=None,
auth_header=_auth_header,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_credentials=vertex_credentials,
stream=stream,
custom_llm_provider=custom_llm_provider,
api_base=api_base,
should_use_v1beta1_features=should_use_v1beta1_features,
)
headers = set_headers(auth_header=auth_header, extra_headers=extra_headers)
request_body = await async_transform_request_body(**data) # type: ignore
_async_client_params = {}
if timeout:
@@ -1373,22 +1243,6 @@ class VertexLLM(BaseLLM):
) -> Union[ModelResponse, CustomStreamWrapper]:
stream: Optional[bool] = optional_params.pop("stream", None) # type: ignore
should_use_v1beta1_features = self.is_using_v1beta1_features(
optional_params=optional_params
)
auth_header, url = self._get_token_and_url(
model=model,
gemini_api_key=gemini_api_key,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_credentials=vertex_credentials,
stream=stream,
custom_llm_provider=custom_llm_provider,
api_base=api_base,
should_use_v1beta1_features=should_use_v1beta1_features,
)
transform_request_params = {
"gemini_api_key": gemini_api_key,
"messages": messages,
@@ -1403,8 +1257,6 @@ class VertexLLM(BaseLLM):
"litellm_params": litellm_params,
}
headers = set_headers(auth_header=auth_header, extra_headers=extra_headers)
### ROUTING (ASYNC, STREAMING, SYNC)
if acompletion:
### ASYNC STREAMING
@@ -1412,7 +1264,7 @@ class VertexLLM(BaseLLM):
return self.async_streaming(
model=model,
messages=messages,
-api_base=url,
+api_base=api_base,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
@@ -1424,14 +1276,18 @@ class VertexLLM(BaseLLM):
timeout=timeout,
client=client, # type: ignore
data=transform_request_params,
headers=headers,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_credentials=vertex_credentials,
custom_llm_provider=custom_llm_provider,
extra_headers=extra_headers,
)
### ASYNC COMPLETION
return self.async_completion(
model=model,
messages=messages,
data=transform_request_params, # type: ignore
-api_base=url,
+api_base=api_base,
model_response=model_response,
print_verbose=print_verbose,
encoding=encoding,
@@ -1442,10 +1298,35 @@ class VertexLLM(BaseLLM):
logger_fn=logger_fn,
timeout=timeout,
client=client, # type: ignore
headers=headers,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_credentials=vertex_credentials,
custom_llm_provider=custom_llm_provider,
extra_headers=extra_headers,
)
## SYNC STREAMING CALL ##
should_use_v1beta1_features = self.is_using_v1beta1_features(
optional_params=optional_params
)
_auth_header, vertex_project = self._ensure_access_token(
credentials=vertex_credentials, project_id=vertex_project
)
auth_header, url = self._get_token_and_url(
model=model,
gemini_api_key=gemini_api_key,
auth_header=_auth_header,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_credentials=vertex_credentials,
stream=stream,
custom_llm_provider=custom_llm_provider,
api_base=api_base,
should_use_v1beta1_features=should_use_v1beta1_features,
)
headers = set_headers(auth_header=auth_header, extra_headers=extra_headers)
## TRANSFORMATION ##
data = sync_transform_request_body(**transform_request_params)
@@ -1460,6 +1341,7 @@ class VertexLLM(BaseLLM):
},
)
## SYNC STREAMING CALL ##
if stream is True:
request_data_str = json.dumps(data)
streaming_response = CustomStreamWrapper(
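
The reason this handler now resolves tokens up front with _ensure_access_token_async: google-auth refreshes credentials with a synchronous HTTP round trip (creds.refresh(Request())), which would stall every other coroutine if called directly from the async completion/streaming paths. litellm wraps the blocking call with its asyncify helper; a rough standard-library equivalent of that pattern (asyncio.to_thread stands in for asyncify) looks like this:

import asyncio
import time


def refresh_credentials() -> str:
    # Stand-in for the blocking google-auth call: credentials.refresh(Request()).
    time.sleep(0.5)  # simulates the token-endpoint round trip
    return "ya29.example-token"


async def ensure_access_token_async() -> str:
    # Run the blocking refresh in a worker thread so the event loop stays free.
    return await asyncio.to_thread(refresh_credentials)


async def main() -> None:
    token = await ensure_access_token_async()
    print(token)


if __name__ == "__main__":
    asyncio.run(main())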

@@ -43,8 +43,14 @@ class GoogleBatchEmbeddings(VertexLLM):
client=None,
) -> EmbeddingResponse:
_auth_header, vertex_project = self._ensure_access_token(
credentials=vertex_credentials,
project_id=vertex_project,
)
auth_header, url = self._get_token_and_url(
model=model,
auth_header=_auth_header,
gemini_api_key=api_key,
vertex_project=vertex_project,
vertex_location=vertex_location,

@@ -43,8 +43,15 @@ class VertexMultimodalEmbedding(VertexLLM):
timeout=300,
client=None,
):
_auth_header, vertex_project = self._ensure_access_token(
credentials=vertex_credentials,
project_id=vertex_project,
)
auth_header, url = self._get_token_and_url(
model=model,
auth_header=_auth_header,
gemini_api_key=api_key,
vertex_project=vertex_project,
vertex_location=vertex_location,

@@ -65,8 +65,15 @@ class VertexTextToSpeechAPI(VertexLLM):
import base64
####### Authenticate with Vertex AI ########
_auth_header, vertex_project = self._ensure_access_token(
credentials=vertex_credentials,
project_id=vertex_project,
)
auth_header, _ = self._get_token_and_url(
model="",
auth_header=_auth_header,
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,

@@ -0,0 +1,255 @@
import json
import os
from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Tuple
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.asyncify import asyncify
from litellm.llms.base import BaseLLM
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .common_utils import (
VertexAIError,
_get_gemini_url,
_get_vertex_url,
all_gemini_url_modes,
get_supports_system_message,
)
if TYPE_CHECKING:
from google.auth.credentials import Credentials as GoogleCredentialsObject
else:
GoogleCredentialsObject = Any
class VertexBase(BaseLLM):
def __init__(self) -> None:
super().__init__()
self.access_token: Optional[str] = None
self.refresh_token: Optional[str] = None
self._credentials: Optional[GoogleCredentialsObject] = None
self.project_id: Optional[str] = None
self.async_handler: Optional[AsyncHTTPHandler] = None
def get_vertex_region(self, vertex_region: Optional[str]) -> str:
return vertex_region or "us-central1"
def load_auth(
self, credentials: Optional[str], project_id: Optional[str]
) -> Tuple[Any, str]:
import google.auth as google_auth
from google.auth import identity_pool
from google.auth.credentials import Credentials # type: ignore[import-untyped]
from google.auth.transport.requests import (
Request, # type: ignore[import-untyped]
)
if credentials is not None and isinstance(credentials, str):
import google.oauth2.service_account
verbose_logger.debug(
"Vertex: Loading vertex credentials from %s", credentials
)
verbose_logger.debug(
"Vertex: checking if credentials is a valid path, os.path.exists(%s)=%s, current dir %s",
credentials,
os.path.exists(credentials),
os.getcwd(),
)
try:
if os.path.exists(credentials):
json_obj = json.load(open(credentials))
else:
json_obj = json.loads(credentials)
except Exception:
raise Exception(
"Unable to load vertex credentials from environment. Got={}".format(
credentials
)
)
# Check if the JSON object contains Workload Identity Federation configuration
if "type" in json_obj and json_obj["type"] == "external_account":
creds = identity_pool.Credentials.from_info(json_obj)
else:
creds = (
google.oauth2.service_account.Credentials.from_service_account_info(
json_obj,
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
)
if project_id is None:
project_id = creds.project_id
else:
creds, creds_project_id = google_auth.default(
quota_project_id=project_id,
scopes=["https://www.googleapis.com/auth/cloud-platform"],
)
if project_id is None:
project_id = creds_project_id
creds.refresh(Request())
if not project_id:
raise ValueError("Could not resolve project_id")
if not isinstance(project_id, str):
raise TypeError(
f"Expected project_id to be a str but got {type(project_id)}"
)
return creds, project_id
def refresh_auth(self, credentials: Any) -> None:
from google.auth.transport.requests import (
Request, # type: ignore[import-untyped]
)
credentials.refresh(Request())
def _ensure_access_token(
self, credentials: Optional[str], project_id: Optional[str]
) -> Tuple[str, str]:
"""
Returns auth token and project id
"""
if self.access_token is not None:
if project_id is not None:
return self.access_token, project_id
elif self.project_id is not None:
return self.access_token, self.project_id
if not self._credentials:
self._credentials, cred_project_id = self.load_auth(
credentials=credentials, project_id=project_id
)
if not self.project_id:
self.project_id = project_id or cred_project_id
else:
if self._credentials.expired or not self._credentials.token:
self.refresh_auth(self._credentials)
if not self.project_id:
self.project_id = self._credentials.quota_project_id
if not self.project_id:
raise ValueError("Could not resolve project_id")
if not self._credentials or not self._credentials.token:
raise RuntimeError("Could not resolve API token from the environment")
return self._credentials.token, project_id or self.project_id
def is_using_v1beta1_features(self, optional_params: dict) -> bool:
"""
VertexAI only supports ContextCaching on v1beta1
use this helper to decide if request should be sent to v1 or v1beta1
Returns v1beta1 if context caching is enabled
Returns v1 in all other cases
"""
if "cached_content" in optional_params:
return True
if "CachedContent" in optional_params:
return True
return False
def _get_token_and_url(
self,
model: str,
auth_header: str,
gemini_api_key: Optional[str],
vertex_project: Optional[str],
vertex_location: Optional[str],
vertex_credentials: Optional[str],
stream: Optional[bool],
custom_llm_provider: Literal["vertex_ai", "vertex_ai_beta", "gemini"],
api_base: Optional[str],
should_use_v1beta1_features: Optional[bool] = False,
mode: all_gemini_url_modes = "chat",
) -> Tuple[Optional[str], str]:
"""
Internal function. Returns the token and url for the call.
Handles logic if it's google ai studio vs. vertex ai.
Returns
token, url
"""
if custom_llm_provider == "gemini":
url, endpoint = _get_gemini_url(
mode=mode,
model=model,
stream=stream,
gemini_api_key=gemini_api_key,
)
else:
vertex_location = self.get_vertex_region(vertex_region=vertex_location)
### SET RUNTIME ENDPOINT ###
version: Literal["v1beta1", "v1"] = (
"v1beta1" if should_use_v1beta1_features is True else "v1"
)
url, endpoint = _get_vertex_url(
mode=mode,
model=model,
stream=stream,
vertex_project=vertex_project,
vertex_location=vertex_location,
vertex_api_version=version,
)
if (
api_base is not None
): # for cloudflare ai gateway - https://github.com/BerriAI/litellm/issues/4317
if custom_llm_provider == "gemini":
url = "{}:{}".format(api_base, endpoint)
if gemini_api_key is None:
raise ValueError(
"Missing gemini_api_key, please set `GEMINI_API_KEY`"
)
auth_header = (
gemini_api_key # cloudflare expects api key as bearer token
)
else:
url = "{}:{}".format(api_base, endpoint)
if stream is True:
url = url + "?alt=sse"
return auth_header, url
async def _ensure_access_token_async(
self, credentials: Optional[str], project_id: Optional[str]
) -> Tuple[str, str]:
"""
Async version of _ensure_access_token
"""
if self.access_token is not None:
if project_id is not None:
return self.access_token, project_id
elif self.project_id is not None:
return self.access_token, self.project_id
if not self._credentials:
self._credentials, cred_project_id = await asyncify(self.load_auth)(
credentials=credentials, project_id=project_id
)
if not self.project_id:
self.project_id = project_id or cred_project_id
else:
if self._credentials.expired or not self._credentials.token:
await asyncify(self.refresh_auth)(self._credentials)
if not self.project_id:
self.project_id = self._credentials.quota_project_id
if not self.project_id:
raise ValueError("Could not resolve project_id")
if not self._credentials or not self._credentials.token:
raise RuntimeError("Could not resolve API token from the environment")
return self._credentials.token, project_id or self.project_id
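
Note that both _ensure_access_token variants cache credentials on the instance: the first call pays for load_auth, and later calls reuse self._credentials, refreshing only when the token has expired. A hypothetical subclass exercising the async path (the module path in the import is an assumption based on this commit's file layout):

import asyncio

from litellm.llms.vertex_ai_and_google_ai_studio.vertex_llm_base import VertexBase


class MyVertexClient(VertexBase):
    """Illustrative subclass; VertexLLM and VertexFineTuningAPI inherit the same way."""


async def main() -> None:
    client = MyVertexClient()
    # First call loads and refreshes credentials (path or inline JSON both work).
    token, project = await client._ensure_access_token_async(
        credentials="/path/to/service_account.json",  # placeholder path
        project_id=None,
    )
    # Subsequent calls reuse self._credentials and skip load_auth entirely.
    token_again, _ = await client._ensure_access_token_async(
        credentials=None,
        project_id=project,
    )


asyncio.run(main())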

@@ -150,8 +150,15 @@ async def vertex_proxy_route(
base_target_url = f"https://{vertex_location}-aiplatform.googleapis.com/"
_auth_header, vertex_project = (
await vertex_fine_tuning_apis_instance._ensure_access_token_async(
credentials=vertex_credentials, project_id=vertex_project
)
)
auth_header, _ = vertex_fine_tuning_apis_instance._get_token_and_url(
model="",
auth_header=_auth_header,
gemini_api_key=None,
vertex_credentials=vertex_credentials,
vertex_project=vertex_project,
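
Every call site above ultimately funnels into load_auth, which accepts either a filesystem path or an inline JSON string and branches on the credential type (Workload Identity Federation configs are marked type=external_account; anything else is treated as a service-account key). A condensed, standalone sketch of just that detection logic (helper names are illustrative):

import json
import os


def parse_vertex_credentials(credentials: str) -> dict:
    # Accept either a path to a JSON file or the JSON document itself.
    if os.path.exists(credentials):
        with open(credentials) as f:
            return json.load(f)
    return json.loads(credentials)


def credential_kind(json_obj: dict) -> str:
    # Mirrors the branch in load_auth: external_account -> identity_pool,
    # otherwise google.oauth2.service_account credentials.
    if json_obj.get("type") == "external_account":
        return "identity_pool"
    return "service_account"


print(credential_kind({"type": "external_account"}))  # -> identity_pool
print(credential_kind({"type": "service_account"}))   # -> service_account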