Merge branch 'main' into litellm_fix_vertex_ai_ft_models

commit 0767a3cc88
Author: Ishaan Jaff
Date:   2025-03-26 11:11:54 -07:00

15 changed files with 214 additions and 48 deletions

View file

@@ -9,7 +9,11 @@ commands:
       - run:
           name: "Configure Google DNS"
           command: |
-            echo "nameserver 8.8.8.8" | sudo tee /etc/resolv.conf
+            # Backup original resolv.conf
+            sudo cp /etc/resolv.conf /etc/resolv.conf.backup
+            # Add both local and Google DNS servers
+            echo "nameserver 127.0.0.11" | sudo tee /etc/resolv.conf
+            echo "nameserver 8.8.8.8" | sudo tee -a /etc/resolv.conf
             echo "nameserver 8.8.4.4" | sudo tee -a /etc/resolv.conf
 jobs:
@@ -243,6 +247,12 @@ jobs:
     steps:
       - checkout
       - setup_google_dns
+      - run:
+          name: DNS lookup for Redis host
+          command: |
+            sudo apt-get update
+            sudo apt-get install -y dnsutils
+            dig redis-19899.c239.us-east-1-2.ec2.redns.redis-cloud.com +short
       - run:
           name: Show git commit hash
           command: |
@@ -630,7 +640,6 @@ jobs:
     steps:
       - checkout
-      - setup_google_dns
       - run:
          name: Install Dependencies
          command: |
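
Note: a quick local sanity check for the resolver change, as a minimal sketch (the hostname is copied from the CI step above; run it in any environment where the new resolv.conf applies):

import socket

# Resolve the Redis Cloud host the CI job depends on; this raises
# socket.gaierror if the configured nameservers cannot resolve it.
host = "redis-19899.c239.us-east-1-2.ec2.redns.redis-cloud.com"
print(socket.gethostbyname(host))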

View file

@@ -828,11 +828,14 @@ def get_response_cost_from_hidden_params(
     _hidden_params_dict = hidden_params
     additional_headers = _hidden_params_dict.get("additional_headers", {})
-    if additional_headers and "x-litellm-response-cost" in additional_headers:
-        response_cost = additional_headers["x-litellm-response-cost"]
+    if (
+        additional_headers
+        and "llm_provider-x-litellm-response-cost" in additional_headers
+    ):
+        response_cost = additional_headers["llm_provider-x-litellm-response-cost"]
         if response_cost is None:
             return None
-        return float(additional_headers["x-litellm-response-cost"])
+        return float(additional_headers["llm_provider-x-litellm-response-cost"])
     return None
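
For reference, a minimal standalone sketch of the renamed lookup (a hypothetical helper, not LiteLLM's actual module layout; provider response headers arrive in `additional_headers` with an `llm_provider-` prefix, as the test further below exercises):

from typing import Optional


def response_cost_from_headers(hidden_params: dict) -> Optional[float]:
    # Mirrors the updated branch above: read the provider-prefixed
    # header instead of the bare "x-litellm-response-cost" key.
    additional_headers = hidden_params.get("additional_headers", {})
    if additional_headers and "llm_provider-x-litellm-response-cost" in additional_headers:
        response_cost = additional_headers["llm_provider-x-litellm-response-cost"]
        if response_cost is None:
            return None
        return float(response_cost)
    return None


assert response_cost_from_headers(
    {"additional_headers": {"llm_provider-x-litellm-response-cost": "120"}}
) == 120.0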

View file

@@ -60,7 +60,9 @@ def get_supports_response_schema(
 from typing import Literal, Optional

-all_gemini_url_modes = Literal["chat", "embedding", "batch_embedding"]
+all_gemini_url_modes = Literal[
+    "chat", "embedding", "batch_embedding", "image_generation"
+]


 def _get_vertex_url(
@@ -96,7 +98,11 @@ def _get_vertex_url(
         if model.isdigit():
             # https://us-central1-aiplatform.googleapis.com/v1/projects/$PROJECT_ID/locations/us-central1/endpoints/$ENDPOINT_ID:predict
             url = f"https://{vertex_location}-aiplatform.googleapis.com/{vertex_api_version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}"
+    elif mode == "image_generation":
+        endpoint = "predict"
+        url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}"
+        if model.isdigit():
+            url = f"https://{vertex_location}-aiplatform.googleapis.com/{vertex_api_version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}"
     if not url or not endpoint:
         raise ValueError(f"Unable to get vertex url/endpoint for mode: {mode}")
     return url, endpoint
@@ -132,6 +138,10 @@ def _get_gemini_url(
         url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
             _gemini_model_name, endpoint, gemini_api_key
         )
+    elif mode == "image_generation":
+        raise ValueError(
+            "LiteLLM's `gemini/` route does not support image generation yet. Let us know if you need this feature by opening an issue at https://github.com/BerriAI/litellm/issues"
+        )

     return url, endpoint
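
To illustrate the new `image_generation` mode, a sketch of the URL the added branch builds (project and location values are placeholders):

# Placeholder values for illustration only.
vertex_location = "us-central1"
vertex_project = "my-gcp-project"
model = "imagegeneration"
endpoint = "predict"

url = (
    f"https://{vertex_location}-aiplatform.googleapis.com/v1"
    f"/projects/{vertex_project}/locations/{vertex_location}"
    f"/publishers/google/models/{model}:{endpoint}"
)
# -> https://us-central1-aiplatform.googleapis.com/v1/projects/my-gcp-project/locations/us-central1/publishers/google/models/imagegeneration:predict
print(url)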

View file

@@ -43,22 +43,23 @@ class VertexImageGeneration(VertexLLM):
     def image_generation(
         self,
         prompt: str,
+        api_base: Optional[str],
         vertex_project: Optional[str],
         vertex_location: Optional[str],
         vertex_credentials: Optional[VERTEX_CREDENTIALS_TYPES],
         model_response: ImageResponse,
         logging_obj: Any,
-        model: Optional[
-            str
-        ] = "imagegeneration",  # vertex ai uses imagegeneration as the default model
+        model: str = "imagegeneration",  # vertex ai uses imagegeneration as the default model
         client: Optional[Any] = None,
         optional_params: Optional[dict] = None,
         timeout: Optional[int] = None,
         aimg_generation=False,
+        extra_headers: Optional[dict] = None,
     ) -> ImageResponse:
         if aimg_generation is True:
             return self.aimage_generation(  # type: ignore
                 prompt=prompt,
+                api_base=api_base,
                 vertex_project=vertex_project,
                 vertex_location=vertex_location,
                 vertex_credentials=vertex_credentials,
@@ -83,13 +84,27 @@ class VertexImageGeneration(VertexLLM):
         else:
             sync_handler = client  # type: ignore

-        url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:predict"
+        # url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:predict"
+        auth_header: Optional[str] = None
         auth_header, _ = self._ensure_access_token(
             credentials=vertex_credentials,
             project_id=vertex_project,
             custom_llm_provider="vertex_ai",
         )
+        auth_header, api_base = self._get_token_and_url(
+            model=model,
+            gemini_api_key=None,
+            auth_header=auth_header,
+            vertex_project=vertex_project,
+            vertex_location=vertex_location,
+            vertex_credentials=vertex_credentials,
+            stream=False,
+            custom_llm_provider="vertex_ai",
+            api_base=api_base,
+            should_use_v1beta1_features=False,
+            mode="image_generation",
+        )
         optional_params = optional_params or {
             "sampleCount": 1
         }  # default optional params
@@ -99,31 +114,21 @@ class VertexImageGeneration(VertexLLM):
             "parameters": optional_params,
         }

-        request_str = f"\n curl -X POST \\\n -H \"Authorization: Bearer {auth_header[:10] + 'XXXXXXXXXX'}\" \\\n -H \"Content-Type: application/json; charset=utf-8\" \\\n -d {request_data} \\\n \"{url}\""
-        logging_obj.pre_call(
-            input=prompt,
-            api_key=None,
-            additional_args={
-                "complete_input_dict": optional_params,
-                "request_str": request_str,
-            },
-        )
-
+        headers = self.set_headers(auth_header=auth_header, extra_headers=extra_headers)
         logging_obj.pre_call(
             input=prompt,
-            api_key=None,
+            api_key="",
             additional_args={
                 "complete_input_dict": optional_params,
-                "request_str": request_str,
+                "api_base": api_base,
+                "headers": headers,
             },
         )

         response = sync_handler.post(
-            url=url,
-            headers={
-                "Content-Type": "application/json; charset=utf-8",
-                "Authorization": f"Bearer {auth_header}",
-            },
+            url=api_base,
+            headers=headers,
             data=json.dumps(request_data),
         )
@@ -138,17 +143,17 @@ class VertexImageGeneration(VertexLLM):
     async def aimage_generation(
         self,
         prompt: str,
+        api_base: Optional[str],
         vertex_project: Optional[str],
         vertex_location: Optional[str],
         vertex_credentials: Optional[VERTEX_CREDENTIALS_TYPES],
         model_response: litellm.ImageResponse,
         logging_obj: Any,
-        model: Optional[
-            str
-        ] = "imagegeneration",  # vertex ai uses imagegeneration as the default model
+        model: str = "imagegeneration",  # vertex ai uses imagegeneration as the default model
         client: Optional[AsyncHTTPHandler] = None,
         optional_params: Optional[dict] = None,
         timeout: Optional[int] = None,
+        extra_headers: Optional[dict] = None,
     ):
         response = None
         if client is None:
@@ -169,7 +174,6 @@ class VertexImageGeneration(VertexLLM):
         # make POST request to
         # https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict
-        url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:predict"

         """
         Docs link: https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/imagegeneration?project=adroit-crow-413218
@@ -188,11 +192,25 @@ class VertexImageGeneration(VertexLLM):
         } \
         "https://us-central1-aiplatform.googleapis.com/v1/projects/PROJECT_ID/locations/us-central1/publishers/google/models/imagegeneration:predict"
         """
+        auth_header: Optional[str] = None
         auth_header, _ = self._ensure_access_token(
             credentials=vertex_credentials,
             project_id=vertex_project,
             custom_llm_provider="vertex_ai",
         )
+        auth_header, api_base = self._get_token_and_url(
+            model=model,
+            gemini_api_key=None,
+            auth_header=auth_header,
+            vertex_project=vertex_project,
+            vertex_location=vertex_location,
+            vertex_credentials=vertex_credentials,
+            stream=False,
+            custom_llm_provider="vertex_ai",
+            api_base=api_base,
+            should_use_v1beta1_features=False,
+            mode="image_generation",
+        )
         optional_params = optional_params or {
             "sampleCount": 1
         }  # default optional params
@@ -202,22 +220,21 @@ class VertexImageGeneration(VertexLLM):
             "parameters": optional_params,
         }

-        request_str = f"\n curl -X POST \\\n -H \"Authorization: Bearer {auth_header[:10] + 'XXXXXXXXXX'}\" \\\n -H \"Content-Type: application/json; charset=utf-8\" \\\n -d {request_data} \\\n \"{url}\""
+        headers = self.set_headers(auth_header=auth_header, extra_headers=extra_headers)
         logging_obj.pre_call(
             input=prompt,
-            api_key=None,
+            api_key="",
             additional_args={
                 "complete_input_dict": optional_params,
-                "request_str": request_str,
+                "api_base": api_base,
+                "headers": headers,
             },
         )

         response = await self.async_handler.post(
-            url=url,
-            headers={
-                "Content-Type": "application/json; charset=utf-8",
-                "Authorization": f"Bearer {auth_header}",
-            },
+            url=api_base,
+            headers=headers,
             data=json.dumps(request_data),
         )
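
A hedged usage sketch of the reworked handler via the public API (project, location, and api_base values are placeholders; `litellm.vertex_project`/`litellm.vertex_location` follow LiteLLM's documented Vertex settings):

import litellm

litellm.vertex_project = "my-gcp-project"  # placeholder
litellm.vertex_location = "us-central1"

# With this change, an api_base override (e.g. a proxy in front of
# Vertex) is forwarded down to the image generation handler instead
# of being ignored.
response = litellm.image_generation(
    model="vertex_ai/imagegeneration@006",
    prompt="An armchair in the shape of an avocado",
    api_base="https://my-vertex-proxy.example.com",  # placeholder
)
print(response.data)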

View file

@@ -111,7 +111,7 @@ class VertexEmbedding(VertexBase):
         )

         try:
-            response = client.post(api_base, headers=headers, json=vertex_request)  # type: ignore
+            response = client.post(url=api_base, headers=headers, json=vertex_request)  # type: ignore
             response.raise_for_status()
         except httpx.HTTPStatusError as err:
             error_code = err.response.status_code

View file

@@ -4732,6 +4732,7 @@ def image_generation(  # noqa: PLR0915
                 vertex_location=vertex_ai_location,
                 vertex_credentials=vertex_credentials,
                 aimg_generation=aimg_generation,
+                api_base=api_base,
                 client=client,
             )
         elif (

View file

@@ -4694,6 +4694,8 @@
         "output_cost_per_token": 0.0000003,
         "litellm_provider": "gemini",
         "mode": "chat",
+        "tpm": 4000000,
+        "rpm": 4000,
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,

View file

@@ -1,5 +1,5 @@
 model_list:
-  - model_name: "gpt-3.5-turbo"
+  - model_name: "gpt-4o"
     litellm_params:
       model: azure/chatgpt-v-2
       api_key: os.environ/AZURE_API_KEY

View file

@@ -7,7 +7,7 @@ from litellm.proxy._types import GenerateKeyRequest, UserAPIKeyAuth
 async def user_api_key_auth(request: Request, api_key: str) -> UserAPIKeyAuth:
     try:
-        modified_master_key = f"{os.getenv('PROXY_MASTER_KEY')}-1234"
+        modified_master_key = f"{os.getenv('LITELLM_MASTER_KEY')}-1234"
         if api_key == modified_master_key:
             return UserAPIKeyAuth(api_key=api_key)
         raise Exception
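
The custom auth hook now keys off `LITELLM_MASTER_KEY` instead of `PROXY_MASTER_KEY`; a minimal sketch of the credential it accepts (key value is a placeholder):

import os

os.environ["LITELLM_MASTER_KEY"] = "sk-1234"  # placeholder
accepted_api_key = f"{os.getenv('LITELLM_MASTER_KEY')}-1234"
print(accepted_api_key)  # sk-1234-1234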

View file

@@ -4694,6 +4694,8 @@
         "output_cost_per_token": 0.0000003,
         "litellm_provider": "gemini",
         "mode": "chat",
+        "tpm": 4000000,
+        "rpm": 4000,
         "supports_system_messages": true,
         "supports_function_calling": true,
         "supports_vision": true,

View file

@@ -15,9 +15,11 @@ from pydantic import BaseModel
 from litellm.cost_calculator import response_cost_calculator

-def test_cost_calculator():
+def test_cost_calculator_with_response_cost_in_additional_headers():
     class MockResponse(BaseModel):
-        _hidden_params = {"additional_headers": {"x-litellm-response-cost": 1000}}
+        _hidden_params = {
+            "additional_headers": {"llm_provider-x-litellm-response-cost": 1000}
+        }

     result = response_cost_calculator(
         response_object=MockResponse(),

View file

@@ -31,7 +31,7 @@ async def test_litellm_gateway_from_sdk():
     openai_client = OpenAI(api_key="fake-key")

     with patch.object(
-        openai_client.chat.completions, "create", new=MagicMock()
+        openai_client.chat.completions.with_raw_response, "create", new=MagicMock()
     ) as mock_call:
         try:
             completion(
@@ -374,3 +374,78 @@ async def test_litellm_gateway_from_sdk_rerank(is_async):
     assert request_body["query"] == "What is machine learning?"
     assert request_body["model"] == "rerank-english-v2.0"
     assert len(request_body["documents"]) == 2
+
+
+def test_litellm_gateway_from_sdk_with_response_cost_in_additional_headers():
+    litellm.set_verbose = True
+    litellm._turn_on_debug()
+    from openai import OpenAI
+
+    openai_client = OpenAI(api_key="fake-key")
+
+    # Create mock response object
+    mock_response = MagicMock()
+    mock_response.headers = {"x-litellm-response-cost": "120"}
+    mock_response.parse.return_value = litellm.ModelResponse(
+        **{
+            "id": "chatcmpl-BEkxQvRGp9VAushfAsOZCbhMFLsoy",
+            "choices": [
+                {
+                    "finish_reason": "stop",
+                    "index": 0,
+                    "logprobs": None,
+                    "message": {
+                        "content": "Hello! How can I assist you today?",
+                        "refusal": None,
+                        "role": "assistant",
+                        "annotations": [],
+                        "audio": None,
+                        "function_call": None,
+                        "tool_calls": None,
+                    },
+                }
+            ],
+            "created": 1742856796,
+            "model": "gpt-4o-2024-08-06",
+            "object": "chat.completion",
+            "service_tier": "default",
+            "system_fingerprint": "fp_6ec83003ad",
+            "usage": {
+                "completion_tokens": 10,
+                "prompt_tokens": 9,
+                "total_tokens": 19,
+                "completion_tokens_details": {
+                    "accepted_prediction_tokens": 0,
+                    "audio_tokens": 0,
+                    "reasoning_tokens": 0,
+                    "rejected_prediction_tokens": 0,
+                },
+                "prompt_tokens_details": {"audio_tokens": 0, "cached_tokens": 0},
+            },
+        }
+    )
+
+    with patch.object(
+        openai_client.chat.completions.with_raw_response,
+        "create",
+        return_value=mock_response,
+    ) as mock_call:
+        response = litellm.completion(
+            model="litellm_proxy/gpt-4o",
+            messages=[{"role": "user", "content": "Hello world"}],
+            api_base="http://0.0.0.0:4000",
+            api_key="sk-PIp1h0RekR",
+            client=openai_client,
+        )
+
+        # Assert the headers were properly passed through
+        print(f"additional_headers: {response._hidden_params['additional_headers']}")
+        assert (
+            response._hidden_params["additional_headers"][
+                "llm_provider-x-litellm-response-cost"
+            ]
+            == "120"
+        )
+
+        assert response._hidden_params["response_cost"] == 120

View file

@@ -59,7 +59,7 @@ def load_vertex_ai_credentials():
 async def create_async_vertex_embedding_task():
     load_vertex_ai_credentials()
-    base_url = "https://exampleopenaiendpoint-production.up.railway.app/v1/projects/pathrise-convert-1606954137718/locations/us-central1/publishers/google/models/embedding-gecko-001:predict"
+    base_url = "https://exampleopenaiendpoint-production.up.railway.app/v1/projects/pathrise-convert-1606954137718/locations/us-central1/publishers/google/models/textembedding-gecko@001"
     embedding_args = {
         "model": "vertex_ai/textembedding-gecko",
         "input": "This is a test sentence for embedding.",
@@ -109,12 +109,13 @@ def analyze_results(vertex_times):

 @pytest.mark.asyncio
-async def test_embedding_performance():
+async def test_embedding_performance(monkeypatch):
     """
     Run load test on vertex AI embeddings to ensure vertex median response time is less than 300ms

     20 RPS for 20 seconds
     """
+    monkeypatch.setattr(litellm, "api_base", None)
     duration_seconds = 20
     requests_per_second = 20
     vertex_times = await run_load_test(duration_seconds, requests_per_second)

View file

@@ -31,6 +31,7 @@ from litellm import (
     completion,
     completion_cost,
     embedding,
+    image_generation,
 )
 from litellm.llms.vertex_ai.gemini.transformation import (
     _gemini_convert_messages_with_history,
@@ -3419,3 +3420,46 @@ def test_gemini_fine_tuned_model_request_consistency():
     second_json = json.dumps(second_request_body, indent=2).splitlines()

     # Assert there is no difference between the request bodies
     assert first_json == second_json, "Request bodies should be identical"
+
+
+@pytest.mark.parametrize("provider", ["vertex_ai", "gemini"])
+@pytest.mark.parametrize("route", ["completion", "embedding", "image_generation"])
+def test_litellm_api_base(monkeypatch, provider, route):
+    from litellm.llms.custom_httpx.http_handler import HTTPHandler
+
+    client = HTTPHandler()
+
+    import litellm
+
+    monkeypatch.setattr(litellm, "api_base", "https://litellm.com")
+
+    load_vertex_ai_credentials()
+
+    if route == "image_generation" and provider == "gemini":
+        pytest.skip("Gemini does not support image generation")
+
+    with patch.object(client, "post", new=MagicMock()) as mock_client:
+        try:
+            if route == "completion":
+                response = completion(
+                    model=f"{provider}/gemini-2.0-flash-001",
+                    messages=[{"role": "user", "content": "Hello, world!"}],
+                    client=client,
+                )
+            elif route == "embedding":
+                response = embedding(
+                    model=f"{provider}/gemini-2.0-flash-001",
+                    input=["Hello, world!"],
+                    client=client,
+                )
+            elif route == "image_generation":
+                response = image_generation(
+                    model=f"{provider}/gemini-2.0-flash-001",
+                    prompt="Hello, world!",
+                    client=client,
+                )
+        except Exception as e:
+            print(e)
+
+        mock_client.assert_called()
+        assert mock_client.call_args.kwargs["url"].startswith("https://litellm.com")

View file

@@ -11,7 +11,7 @@ import os
 sys.path.insert(
     0, os.path.abspath("../..")
-)  # Adds the parent directory to the system-path
+)  # Adds the parent directory to the system path
 import os