import os
import sys
import traceback

from dotenv import load_dotenv

load_dotenv()
import io

from test_streaming import streaming_format_tests

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import asyncio
import json
import tempfile
from unittest.mock import AsyncMock, MagicMock, patch

from respx import MockRouter

import httpx
import pytest

import litellm
from litellm import (
    RateLimitError,
    Timeout,
    acompletion,
    completion,
    completion_cost,
    embedding,
)
from litellm.llms.vertex_ai_and_google_ai_studio.gemini.transformation import (
    _gemini_convert_messages_with_history,
)
from litellm.llms.vertex_ai_and_google_ai_studio.vertex_llm_base import VertexBase

litellm.num_retries = 3
litellm.cache = None

user_message = "Write a short poem about the sky"
messages = [{"content": user_message, "role": "user"}]

VERTEX_MODELS_TO_NOT_TEST = [
    "medlm-medium",
    "medlm-large",
    "code-gecko",
    "code-gecko@001",
    "code-gecko@002",
    "code-gecko@latest",
    "codechat-bison@latest",
    "code-bison@001",
    "text-bison@001",
    "gemini-1.5-pro",
    "gemini-1.5-pro-preview-0215",
    "gemini-pro-experimental",
    "gemini-flash-experimental",
    "gemini-1.5-flash-exp-0827",
    "gemini-pro-flash",
]


def get_vertex_ai_creds_json() -> dict:
    # Define the path to the vertex_key.json file
    print("loading vertex ai credentials")
    filepath = os.path.dirname(os.path.abspath(__file__))
    vertex_key_path = filepath + "/vertex_key.json"

    # Read the existing content of the file or create an empty dictionary
    try:
        with open(vertex_key_path, "r") as file:
            # Read the file content
            print("Read vertexai file path")
            content = file.read()

            # If the file is empty or not valid JSON, create an empty dictionary
            if not content or not content.strip():
                service_account_key_data = {}
            else:
                # Attempt to load the existing JSON content
                file.seek(0)
                service_account_key_data = json.load(file)
    except FileNotFoundError:
        # If the file doesn't exist, create an empty dictionary
        service_account_key_data = {}

    # Update the service_account_key_data with environment variables
    private_key_id = os.environ.get("VERTEX_AI_PRIVATE_KEY_ID", "")
    private_key = os.environ.get("VERTEX_AI_PRIVATE_KEY", "")
    private_key = private_key.replace("\\n", "\n")
    service_account_key_data["private_key_id"] = private_key_id
    service_account_key_data["private_key"] = private_key

    return service_account_key_data


def load_vertex_ai_credentials():
    # Define the path to the vertex_key.json file
    print("loading vertex ai credentials")
    filepath = os.path.dirname(os.path.abspath(__file__))
    vertex_key_path = filepath + "/vertex_key.json"

    # Read the existing content of the file or create an empty dictionary
    try:
        with open(vertex_key_path, "r") as file:
            # Read the file content
            print("Read vertexai file path")
            content = file.read()

            # If the file is empty or not valid JSON, create an empty dictionary
            if not content or not content.strip():
                service_account_key_data = {}
            else:
                # Attempt to load the existing JSON content
                file.seek(0)
                service_account_key_data = json.load(file)
    except FileNotFoundError:
        # If the file doesn't exist, create an empty dictionary
        service_account_key_data = {}

    # Update the service_account_key_data with environment variables
    private_key_id = os.environ.get("VERTEX_AI_PRIVATE_KEY_ID", "")
    private_key = os.environ.get("VERTEX_AI_PRIVATE_KEY", "")
    private_key = private_key.replace("\\n", "\n")
    service_account_key_data["private_key_id"] = private_key_id
    service_account_key_data["private_key"] = private_key

    # Create a temporary file
    with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_file:
        # Write the updated content to the temporary file
        json.dump(service_account_key_data, temp_file, indent=2)

    # Export the temporary file as GOOGLE_APPLICATION_CREDENTIALS
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.abspath(temp_file.name)


@pytest.mark.asyncio
async def test_get_response():
    load_vertex_ai_credentials()
    prompt = '\ndef count_nums(arr):\n """\n Write a function count_nums which takes an array of integers and returns\n the number of elements which has a sum of digits > 0.\n If a number is negative, then its first signed digit will be negative:\n e.g. -123 has signed digits -1, 2, and 3.\n >>> count_nums([]) == 0\n >>> count_nums([-1, 11, -11]) == 1\n >>> count_nums([1, 1, 2]) == 3\n """\n'
    try:
        response = await acompletion(
            model="gemini-pro",
            messages=[
                {
                    "role": "system",
                    "content": "Complete the given code with no more explanation. Remember that there is a 4-space indent before the first line of your generated code.",
                },
                {"role": "user", "content": prompt},
            ],
        )
        return response
    except litellm.RateLimitError:
        pass
    except litellm.UnprocessableEntityError as e:
        pass
    except Exception as e:
        pytest.fail(f"An error occurred - {str(e)}")


@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_get_router_response():
    model = "claude-3-sonnet@20240229"
    vertex_ai_project = "adroit-crow-413218"
    vertex_ai_location = "asia-southeast1"
    json_obj = get_vertex_ai_creds_json()
    vertex_credentials = json.dumps(json_obj)

    prompt = '\ndef count_nums(arr):\n """\n Write a function count_nums which takes an array of integers and returns\n the number of elements which has a sum of digits > 0.\n If a number is negative, then its first signed digit will be negative:\n e.g. -123 has signed digits -1, 2, and 3.\n >>> count_nums([]) == 0\n >>> count_nums([-1, 11, -11]) == 1\n >>> count_nums([1, 1, 2]) == 3\n """\n'
    try:
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "sonnet",
                    "litellm_params": {
                        "model": "vertex_ai/claude-3-sonnet@20240229",
                        "vertex_ai_project": vertex_ai_project,
                        "vertex_ai_location": vertex_ai_location,
                        "vertex_credentials": vertex_credentials,
                    },
                }
            ]
        )
        response = await router.acompletion(
            model="sonnet",
            messages=[
                {
                    "role": "system",
                    "content": "Complete the given code with no more explanation. Remember that there is a 4-space indent before the first line of your generated code.",
                },
                {"role": "user", "content": prompt},
            ],
        )

        print(f"\n\nResponse: {response}\n\n")

    except litellm.ServiceUnavailableError:
        pass
    except litellm.UnprocessableEntityError as e:
        pass
    except Exception as e:
        pytest.fail(f"An error occurred - {str(e)}")


# @pytest.mark.skip(
#     reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
# )
@pytest.mark.flaky(retries=3, delay=1)
def test_vertex_ai_anthropic():
    model = "claude-3-sonnet@20240229"

    vertex_ai_project = "adroit-crow-413218"
    vertex_ai_location = "asia-southeast1"
    json_obj = get_vertex_ai_creds_json()
    vertex_credentials = json.dumps(json_obj)

    response = completion(
        model="vertex_ai/" + model,
        messages=[{"role": "user", "content": "hi"}],
        temperature=0.7,
        vertex_ai_project=vertex_ai_project,
        vertex_ai_location=vertex_ai_location,
        vertex_credentials=vertex_credentials,
    )
    print("\nModel Response", response)


# @pytest.mark.skip(
#     reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
# )
@pytest.mark.flaky(retries=3, delay=1)
def test_vertex_ai_anthropic_streaming():
    try:
        load_vertex_ai_credentials()

        # litellm.set_verbose = True

        model = "claude-3-sonnet@20240229"

        vertex_ai_project = "adroit-crow-413218"
        vertex_ai_location = "asia-southeast1"
        json_obj = get_vertex_ai_creds_json()
        vertex_credentials = json.dumps(json_obj)

        response = completion(
            model="vertex_ai/" + model,
            messages=[{"role": "user", "content": "hi"}],
            temperature=0.7,
            vertex_ai_project=vertex_ai_project,
            vertex_ai_location=vertex_ai_location,
            stream=True,
        )
        # print("\nModel Response", response)
        for idx, chunk in enumerate(response):
            print(f"chunk: {chunk}")
            streaming_format_tests(idx=idx, chunk=chunk)

        # raise Exception("it worked!")
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


# test_vertex_ai_anthropic_streaming()


# @pytest.mark.skip(
#     reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
# )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_vertex_ai_anthropic_async():
    # load_vertex_ai_credentials()
    try:
        model = "claude-3-sonnet@20240229"

        vertex_ai_project = "adroit-crow-413218"
        vertex_ai_location = "asia-southeast1"
        json_obj = get_vertex_ai_creds_json()
        vertex_credentials = json.dumps(json_obj)

        response = await acompletion(
            model="vertex_ai/" + model,
            messages=[{"role": "user", "content": "hi"}],
            temperature=0.7,
            vertex_ai_project=vertex_ai_project,
            vertex_ai_location=vertex_ai_location,
            vertex_credentials=vertex_credentials,
        )
        print(f"Model Response: {response}")
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


# asyncio.run(test_vertex_ai_anthropic_async())


# @pytest.mark.skip(
#     reason="Local test. Vertex AI Quota is low. Leads to rate limit errors on ci/cd."
# )
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_vertex_ai_anthropic_async_streaming():
    # load_vertex_ai_credentials()
    try:
        litellm.set_verbose = True
        model = "claude-3-sonnet@20240229"

        vertex_ai_project = "adroit-crow-413218"
        vertex_ai_location = "asia-southeast1"
        json_obj = get_vertex_ai_creds_json()
        vertex_credentials = json.dumps(json_obj)

        response = await acompletion(
            model="vertex_ai/" + model,
            messages=[{"role": "user", "content": "hi"}],
            temperature=0.7,
            vertex_ai_project=vertex_ai_project,
            vertex_ai_location=vertex_ai_location,
            vertex_credentials=vertex_credentials,
            stream=True,
        )

        idx = 0
        async for chunk in response:
            streaming_format_tests(idx=idx, chunk=chunk)
            idx += 1
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


# asyncio.run(test_vertex_ai_anthropic_async_streaming())


@pytest.mark.flaky(retries=3, delay=1)
def test_vertex_ai():
    import random

    litellm.num_retries = 3
    load_vertex_ai_credentials()
    test_models = (
        litellm.vertex_chat_models
        + litellm.vertex_code_chat_models
        + litellm.vertex_text_models
        + litellm.vertex_code_text_models
    )
    litellm.set_verbose = False
    vertex_ai_project = "adroit-crow-413218"
    # litellm.vertex_project = "adroit-crow-413218"

    test_models = random.sample(test_models, 1)
    test_models += litellm.vertex_language_models  # always test gemini-pro
    for model in test_models:
        try:
            if model in VERTEX_MODELS_TO_NOT_TEST or (
                "gecko" in model
                or "32k" in model
                or "ultra" in model
                or "002" in model
            ):
                # our account does not have access to this model
                continue
            print("making request", model)
            response = completion(
                model=model,
                messages=[{"role": "user", "content": "hi"}],
                temperature=0.7,
                vertex_ai_project=vertex_ai_project,
            )
            print("\nModel Response", response)
            print(response)

            assert type(response.choices[0].message.content) == str
            assert len(response.choices[0].message.content) > 1
            print(
                f"response.choices[0].finish_reason: {response.choices[0].finish_reason}"
            )
            assert response.choices[0].finish_reason in litellm._openai_finish_reasons
        except litellm.RateLimitError as e:
            pass
        except litellm.InternalServerError as e:
            pass
        except Exception as e:
            pytest.fail(f"Error occurred: {e}")


# test_vertex_ai()


@pytest.mark.flaky(retries=3, delay=1)
def test_vertex_ai_stream():
    load_vertex_ai_credentials()
    litellm.set_verbose = True
    litellm.vertex_project = "adroit-crow-413218"
    import random

    test_models = (
        litellm.vertex_chat_models
        + litellm.vertex_code_chat_models
        + litellm.vertex_text_models
        + litellm.vertex_code_text_models
    )
    test_models = random.sample(test_models, 1)
    test_models += litellm.vertex_language_models  # always test gemini-pro
    for model in test_models:
        try:
            if model in VERTEX_MODELS_TO_NOT_TEST or (
                "gecko" in model
                or "32k" in model
                or "ultra" in model
                or "002" in model
            ):
                # our account does not have access to this model
                continue
            print("making request", model)
            response = completion(
                model=model,
                messages=[{"role": "user", "content": "hello tell me a short story"}],
                max_tokens=15,
                stream=True,
            )
            completed_str = ""
            for chunk in response:
                print(chunk)
                content = chunk.choices[0].delta.content or ""
                print("\n content", content)
                completed_str += content
                assert type(content) == str
                # pass
            assert len(completed_str) > 1
        except litellm.RateLimitError as e:
            pass
        except litellm.InternalServerError as e:
            pass
        except Exception as e:
            pytest.fail(f"Error occurred: {e}")


# test_vertex_ai_stream()


@pytest.mark.flaky(retries=3, delay=1)
@pytest.mark.asyncio
async def test_async_vertexai_response():
import random load_vertex_ai_credentials() test_models = ( litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models ) test_models = random.sample(test_models, 1) test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: print( f"model being tested in async call: {model}, litellm.vertex_language_models: {litellm.vertex_language_models}" ) if model in VERTEX_MODELS_TO_NOT_TEST or ( "gecko" in model or "32k" in model or "ultra" in model or "002" in model ): # our account does not have access to this model continue try: user_message = "Hello, how are you?" messages = [{"content": user_message, "role": "user"}] response = await acompletion( model=model, messages=messages, temperature=0.7, timeout=5 ) print(f"response: {response}") except litellm.RateLimitError as e: pass except litellm.Timeout as e: pass except litellm.APIError as e: pass except litellm.InternalServerError as e: pass except Exception as e: pytest.fail(f"An exception occurred: {e}") # asyncio.run(test_async_vertexai_response()) @pytest.mark.flaky(retries=3, delay=1) @pytest.mark.asyncio async def test_async_vertexai_streaming_response(): import random load_vertex_ai_credentials() test_models = ( litellm.vertex_chat_models + litellm.vertex_code_chat_models + litellm.vertex_text_models + litellm.vertex_code_text_models ) test_models = random.sample(test_models, 1) test_models += litellm.vertex_language_models # always test gemini-pro for model in test_models: if model in VERTEX_MODELS_TO_NOT_TEST or ( "gecko" in model or "32k" in model or "ultra" in model or "002" in model ): # our account does not have access to this model continue try: user_message = "Hello, how are you?" messages = [{"content": user_message, "role": "user"}] response = await acompletion( model=model, messages=messages, temperature=0.7, timeout=5, stream=True, ) print(f"response: {response}") complete_response = "" async for chunk in response: print(f"chunk: {chunk}") if chunk.choices[0].delta.content is not None: complete_response += chunk.choices[0].delta.content print(f"complete_response: {complete_response}") assert len(complete_response) > 0 except litellm.RateLimitError as e: pass except litellm.APIConnectionError: pass except litellm.Timeout as e: pass except litellm.InternalServerError as e: pass except Exception as e: print(e) pytest.fail(f"An exception occurred: {e}") # asyncio.run(test_async_vertexai_streaming_response()) @pytest.mark.parametrize("provider", ["vertex_ai"]) # "vertex_ai_beta" @pytest.mark.parametrize("sync_mode", [True, False]) @pytest.mark.flaky(retries=3, delay=1) @pytest.mark.asyncio async def test_gemini_pro_vision(provider, sync_mode): try: load_vertex_ai_credentials() litellm.set_verbose = True litellm.num_retries = 3 if sync_mode: resp = litellm.completion( model="{}/gemini-1.5-flash-preview-0514".format(provider), messages=[ {"role": "system", "content": "Be a good bot"}, { "role": "user", "content": [ {"type": "text", "text": "Whats in this image?"}, { "type": "image_url", "image_url": { "url": "gs://cloud-samples-data/generative-ai/image/boats.jpeg" }, }, ], }, ], ) else: resp = await litellm.acompletion( model="{}/gemini-1.5-flash-preview-0514".format(provider), messages=[ {"role": "system", "content": "Be a good bot"}, { "role": "user", "content": [ {"type": "text", "text": "Whats in this image?"}, { "type": "image_url", "image_url": { "url": "gs://cloud-samples-data/generative-ai/image/boats.jpeg" }, }, ], }, ], ) 
print(resp) prompt_tokens = resp.usage.prompt_tokens # DO Not DELETE this ASSERT # Google counts the prompt tokens for us, we should ensure we use the tokens from the orignal response assert prompt_tokens == 267 # the gemini api returns 267 to us except litellm.RateLimitError as e: pass except Exception as e: if "500 Internal error encountered.'" in str(e): pass else: pytest.fail(f"An exception occurred - {str(e)}") # test_gemini_pro_vision() @pytest.mark.parametrize("load_pdf", [False]) # True, @pytest.mark.flaky(retries=3, delay=1) def test_completion_function_plus_pdf(load_pdf): litellm.set_verbose = True load_vertex_ai_credentials() try: import base64 import requests # URL of the file url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/pdf/2403.05530.pdf" # Download the file if load_pdf: response = requests.get(url) file_data = response.content encoded_file = base64.b64encode(file_data).decode("utf-8") url = f"data:application/pdf;base64,{encoded_file}" image_content = [ {"type": "text", "text": "What's this file about?"}, { "type": "image_url", "image_url": {"url": url}, }, ] image_message = {"role": "user", "content": image_content} response = completion( model="vertex_ai_beta/gemini-1.5-flash-preview-0514", messages=[image_message], stream=False, ) print(response) except litellm.InternalServerError as e: pass except Exception as e: pytest.fail("Got={}".format(str(e))) def encode_image(image_path): import base64 with open(image_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode("utf-8") @pytest.mark.skip( reason="we already test gemini-pro-vision, this is just another way to pass images" ) def test_gemini_pro_vision_base64(): try: load_vertex_ai_credentials() litellm.set_verbose = True image_path = "../proxy/cached_logo.jpg" # Getting the base64 string base64_image = encode_image(image_path) resp = litellm.completion( model="vertex_ai/gemini-1.5-pro", messages=[ { "role": "user", "content": [ {"type": "text", "text": "Whats in this image?"}, { "type": "image_url", "image_url": { "url": "data:image/jpeg;base64," + base64_image }, }, ], } ], ) print(resp) prompt_tokens = resp.usage.prompt_tokens except litellm.InternalServerError: pass except litellm.RateLimitError as e: pass except Exception as e: if "500 Internal error encountered.'" in str(e): pass else: pytest.fail(f"An exception occurred - {str(e)}") def vertex_httpx_grounding_post(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "candidates": [ { "content": { "role": "model", "parts": [ { "text": "Argentina won the FIFA World Cup 2022. Argentina defeated France 4-2 on penalties in the FIFA World Cup 2022 final tournament for the first time after 36 years and the third time overall." 
} ], }, "finishReason": "STOP", "safetyRatings": [ { "category": "HARM_CATEGORY_HATE_SPEECH", "probability": "NEGLIGIBLE", "probabilityScore": 0.14940722, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.07477004, }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.15636235, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.015967654, }, { "category": "HARM_CATEGORY_HARASSMENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.1943678, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.1284158, }, { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "probability": "NEGLIGIBLE", "probabilityScore": 0.09384396, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.0726367, }, ], "groundingMetadata": { "webSearchQueries": ["who won the world cup 2022"], "groundingAttributions": [ { "segment": {"endIndex": 38}, "confidenceScore": 0.9919262, "web": { "uri": "https://www.careerpower.in/fifa-world-cup-winners-list.html", "title": "FIFA World Cup Winners List from 1930 to 2022, Complete List - Career Power", }, }, { "segment": {"endIndex": 38}, "confidenceScore": 0.9919262, "web": { "uri": "https://www.careerpower.in/fifa-world-cup-winners-list.html", "title": "FIFA World Cup Winners List from 1930 to 2022, Complete List - Career Power", }, }, { "segment": {"endIndex": 38}, "confidenceScore": 0.9919262, "web": { "uri": "https://www.britannica.com/sports/2022-FIFA-World-Cup", "title": "2022 FIFA World Cup | Qatar, Controversy, Stadiums, Winner, & Final - Britannica", }, }, { "segment": {"endIndex": 38}, "confidenceScore": 0.9919262, "web": { "uri": "https://en.wikipedia.org/wiki/2022_FIFA_World_Cup_final", "title": "2022 FIFA World Cup final - Wikipedia", }, }, { "segment": {"endIndex": 38}, "confidenceScore": 0.9919262, "web": { "uri": "https://www.transfermarkt.com/2022-world-cup/erfolge/pokalwettbewerb/WM22", "title": "2022 World Cup - All winners - Transfermarkt", }, }, { "segment": {"startIndex": 39, "endIndex": 187}, "confidenceScore": 0.9919262, "web": { "uri": "https://www.careerpower.in/fifa-world-cup-winners-list.html", "title": "FIFA World Cup Winners List from 1930 to 2022, Complete List - Career Power", }, }, { "segment": {"startIndex": 39, "endIndex": 187}, "confidenceScore": 0.9919262, "web": { "uri": "https://en.wikipedia.org/wiki/2022_FIFA_World_Cup_final", "title": "2022 FIFA World Cup final - Wikipedia", }, }, ], "searchEntryPoint": { "renderedContent": '\u003cstyle\u003e\n.container {\n align-items: center;\n border-radius: 8px;\n display: flex;\n font-family: Google Sans, Roboto, sans-serif;\n font-size: 14px;\n line-height: 20px;\n padding: 8px 12px;\n}\n.chip {\n display: inline-block;\n border: solid 1px;\n border-radius: 16px;\n min-width: 14px;\n padding: 5px 16px;\n text-align: center;\n user-select: none;\n margin: 0 8px;\n -webkit-tap-highlight-color: transparent;\n}\n.carousel {\n overflow: auto;\n scrollbar-width: none;\n white-space: nowrap;\n margin-right: -12px;\n}\n.headline {\n display: flex;\n margin-right: 4px;\n}\n.gradient-container {\n position: relative;\n}\n.gradient {\n position: absolute;\n transform: translate(3px, -9px);\n height: 36px;\n width: 9px;\n}\n@media (prefers-color-scheme: light) {\n .container {\n background-color: #fafafa;\n box-shadow: 0 0 0 1px #0000000f;\n }\n .headline-label {\n color: #1f1f1f;\n }\n .chip {\n background-color: #ffffff;\n border-color: #d2d2d2;\n color: #5e5e5e;\n text-decoration: none;\n }\n .chip:hover {\n background-color: #f2f2f2;\n 
}\n .chip:focus {\n background-color: #f2f2f2;\n }\n .chip:active {\n background-color: #d8d8d8;\n border-color: #b6b6b6;\n }\n .logo-dark {\n display: none;\n }\n .gradient {\n background: linear-gradient(90deg, #fafafa 15%, #fafafa00 100%);\n }\n}\n@media (prefers-color-scheme: dark) {\n .container {\n background-color: #1f1f1f;\n box-shadow: 0 0 0 1px #ffffff26;\n }\n .headline-label {\n color: #fff;\n }\n .chip {\n background-color: #2c2c2c;\n border-color: #3c4043;\n color: #fff;\n text-decoration: none;\n }\n .chip:hover {\n background-color: #353536;\n }\n .chip:focus {\n background-color: #353536;\n }\n .chip:active {\n background-color: #464849;\n border-color: #53575b;\n }\n .logo-light {\n display: none;\n }\n .gradient {\n background: linear-gradient(90deg, #1f1f1f 15%, #1f1f1f00 100%);\n }\n}\n\u003c/style\u003e\n\u003cdiv class="container"\u003e\n \u003cdiv class="headline"\u003e\n \u003csvg class="logo-light" width="18" height="18" viewBox="9 9 35 35" fill="none" xmlns="http://www.w3.org/2000/svg"\u003e\n \u003cpath fill-rule="evenodd" clip-rule="evenodd" d="M42.8622 27.0064C42.8622 25.7839 42.7525 24.6084 42.5487 23.4799H26.3109V30.1568H35.5897C35.1821 32.3041 33.9596 34.1222 32.1258 35.3448V39.6864H37.7213C40.9814 36.677 42.8622 32.2571 42.8622 27.0064V27.0064Z" fill="#4285F4"/\u003e\n \u003cpath fill-rule="evenodd" clip-rule="evenodd" d="M26.3109 43.8555C30.9659 43.8555 34.8687 42.3195 37.7213 39.6863L32.1258 35.3447C30.5898 36.3792 28.6306 37.0061 26.3109 37.0061C21.8282 37.0061 18.0195 33.9811 16.6559 29.906H10.9194V34.3573C13.7563 39.9841 19.5712 43.8555 26.3109 43.8555V43.8555Z" fill="#34A853"/\u003e\n \u003cpath fill-rule="evenodd" clip-rule="evenodd" d="M16.6559 29.8904C16.3111 28.8559 16.1074 27.7588 16.1074 26.6146C16.1074 25.4704 16.3111 24.3733 16.6559 23.3388V18.8875H10.9194C9.74388 21.2072 9.06992 23.8247 9.06992 26.6146C9.06992 29.4045 9.74388 32.022 10.9194 34.3417L15.3864 30.8621L16.6559 29.8904V29.8904Z" fill="#FBBC05"/\u003e\n \u003cpath fill-rule="evenodd" clip-rule="evenodd" d="M26.3109 16.2386C28.85 16.2386 31.107 17.1164 32.9095 18.8091L37.8466 13.8719C34.853 11.082 30.9659 9.3736 26.3109 9.3736C19.5712 9.3736 13.7563 13.245 10.9194 18.8875L16.6559 23.3388C18.0195 19.2636 21.8282 16.2386 26.3109 16.2386V16.2386Z" fill="#EA4335"/\u003e\n \u003c/svg\u003e\n \u003csvg class="logo-dark" width="18" height="18" viewBox="0 0 48 48" xmlns="http://www.w3.org/2000/svg"\u003e\n \u003ccircle cx="24" cy="23" fill="#FFF" r="22"/\u003e\n \u003cpath d="M33.76 34.26c2.75-2.56 4.49-6.37 4.49-11.26 0-.89-.08-1.84-.29-3H24.01v5.99h8.03c-.4 2.02-1.5 3.56-3.07 4.56v.75l3.91 2.97h.88z" fill="#4285F4"/\u003e\n \u003cpath d="M15.58 25.77A8.845 8.845 0 0 0 24 31.86c1.92 0 3.62-.46 4.97-1.31l4.79 3.71C31.14 36.7 27.65 38 24 38c-5.93 0-11.01-3.4-13.45-8.36l.17-1.01 4.06-2.85h.8z" fill="#34A853"/\u003e\n \u003cpath d="M15.59 20.21a8.864 8.864 0 0 0 0 5.58l-5.03 3.86c-.98-2-1.53-4.25-1.53-6.64 0-2.39.55-4.64 1.53-6.64l1-.22 3.81 2.98.22 1.08z" fill="#FBBC05"/\u003e\n \u003cpath d="M24 14.14c2.11 0 4.02.75 5.52 1.98l4.36-4.36C31.22 9.43 27.81 8 24 8c-5.93 0-11.01 3.4-13.45 8.36l5.03 3.85A8.86 8.86 0 0 1 24 14.14z" fill="#EA4335"/\u003e\n \u003c/svg\u003e\n \u003cdiv class="gradient-container"\u003e\u003cdiv class="gradient"\u003e\u003c/div\u003e\u003c/div\u003e\n \u003c/div\u003e\n \u003cdiv class="carousel"\u003e\n \u003ca class="chip" href="https://www.google.com/search?q=who+won+the+world+cup+2022&client=app-vertex-grounding&safesearch=active"\u003ewho won the world cup 
2022\u003c/a\u003e\n \u003c/div\u003e\n\u003c/div\u003e\n' }, }, } ], "usageMetadata": { "promptTokenCount": 6, "candidatesTokenCount": 48, "totalTokenCount": 54, }, } return mock_response @pytest.mark.parametrize("value_in_dict", [{}, {"disable_attribution": False}]) # def test_gemini_pro_grounding(value_in_dict): try: load_vertex_ai_credentials() litellm.set_verbose = True tools = [{"googleSearchRetrieval": value_in_dict}] litellm.set_verbose = True from litellm.llms.custom_httpx.http_handler import HTTPHandler client = HTTPHandler() with patch.object( client, "post", side_effect=vertex_httpx_grounding_post ) as mock_call: resp = litellm.completion( model="vertex_ai_beta/gemini-1.0-pro-001", messages=[{"role": "user", "content": "Who won the world cup?"}], tools=tools, client=client, ) mock_call.assert_called_once() print(mock_call.call_args.kwargs["json"]["tools"][0]) assert ( "googleSearchRetrieval" in mock_call.call_args.kwargs["json"]["tools"][0] ) assert ( mock_call.call_args.kwargs["json"]["tools"][0]["googleSearchRetrieval"] == value_in_dict ) assert "vertex_ai_grounding_metadata" in resp._hidden_params assert isinstance(resp._hidden_params["vertex_ai_grounding_metadata"], list) except litellm.InternalServerError: pass except litellm.RateLimitError: pass # @pytest.mark.skip(reason="exhausted vertex quota. need to refactor to mock the call") @pytest.mark.parametrize( "model", ["vertex_ai_beta/gemini-1.5-pro", "vertex_ai/claude-3-sonnet@20240229"] ) # "vertex_ai", @pytest.mark.parametrize("sync_mode", [True]) # "vertex_ai", @pytest.mark.asyncio @pytest.mark.flaky(retries=3, delay=1) async def test_gemini_pro_function_calling_httpx(model, sync_mode): try: load_vertex_ai_credentials() litellm.set_verbose = True messages = [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, # User asks for their name and weather in San Francisco { "role": "user", "content": "Hello, what is your name and can you tell me the weather?", }, ] tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. 
San Francisco, CA", } }, "required": ["location"], }, }, } ] data = { "model": model, "messages": messages, "tools": tools, "tool_choice": "required", } print(f"Model for call - {model}") if sync_mode: response = litellm.completion(**data) else: response = await litellm.acompletion(**data) print(f"response: {response}") assert response.choices[0].message.tool_calls[0].function.arguments is not None assert isinstance( response.choices[0].message.tool_calls[0].function.arguments, str ) except litellm.RateLimitError as e: pass except Exception as e: if "429 Quota exceeded" in str(e): pass else: pytest.fail("An unexpected exception occurred - {}".format(str(e))) from test_completion import response_format_tests @pytest.mark.parametrize( "model", [ "vertex_ai/mistral-large@2407", "vertex_ai/mistral-nemo@2407", "vertex_ai/codestral@2405", "vertex_ai/meta/llama3-405b-instruct-maas", ], # ) # "vertex_ai", @pytest.mark.parametrize( "sync_mode", [True, False], ) # @pytest.mark.flaky(retries=3, delay=1) @pytest.mark.asyncio async def test_partner_models_httpx(model, sync_mode): try: load_vertex_ai_credentials() litellm.set_verbose = True messages = [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, # User asks for their name and weather in San Francisco { "role": "user", "content": "Hello, what is your name and can you tell me the weather?", }, ] data = { "model": model, "messages": messages, "timeout": 10, } if sync_mode: response = litellm.completion(**data) else: response = await litellm.acompletion(**data) response_format_tests(response=response) print(f"response: {response}") assert isinstance(response._hidden_params["response_cost"], float) except litellm.RateLimitError as e: pass except litellm.Timeout as e: pass except litellm.InternalServerError as e: pass except litellm.APIConnectionError as e: pass except litellm.ServiceUnavailableError as e: pass except Exception as e: if "429 Quota exceeded" in str(e): pass else: pytest.fail("An unexpected exception occurred - {}".format(str(e))) @pytest.mark.parametrize( "model", [ "vertex_ai/mistral-large@2407", "vertex_ai/meta/llama3-405b-instruct-maas", ], # ) # "vertex_ai", @pytest.mark.parametrize( "sync_mode", [True, False], # ) # @pytest.mark.asyncio @pytest.mark.flaky(retries=3, delay=1) async def test_partner_models_httpx_streaming(model, sync_mode): try: load_vertex_ai_credentials() litellm.set_verbose = True messages = [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, # User asks for their name and weather in San Francisco { "role": "user", "content": "Hello, what is your name and can you tell me the weather?", }, ] data = {"model": model, "messages": messages, "stream": True} if sync_mode: response = litellm.completion(**data) for idx, chunk in enumerate(response): streaming_format_tests(idx=idx, chunk=chunk) else: response = await litellm.acompletion(**data) idx = 0 async for chunk in response: streaming_format_tests(idx=idx, chunk=chunk) idx += 1 print(f"response: {response}") except litellm.RateLimitError as e: pass except litellm.InternalServerError as e: pass except Exception as e: if "429 Quota exceeded" in str(e): pass else: pytest.fail("An unexpected exception occurred - {}".format(str(e))) def vertex_httpx_mock_reject_prompt_post(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "promptFeedback": {"blockReason": "OTHER"}, 
"usageMetadata": {"promptTokenCount": 6285, "totalTokenCount": 6285}, } return mock_response # @pytest.mark.skip(reason="exhausted vertex quota. need to refactor to mock the call") def vertex_httpx_mock_post(url, data=None, json=None, headers=None): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "candidates": [ { "finishReason": "RECITATION", "safetyRatings": [ { "category": "HARM_CATEGORY_HATE_SPEECH", "probability": "NEGLIGIBLE", "probabilityScore": 0.14965563, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.13660839, }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.16344544, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.10230471, }, { "category": "HARM_CATEGORY_HARASSMENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.1979091, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.06052939, }, { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "probability": "NEGLIGIBLE", "probabilityScore": 0.1765296, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.18417984, }, ], "citationMetadata": { "citations": [ { "startIndex": 251, "endIndex": 380, "uri": "https://chocolatecake2023.blogspot.com/2023/02/taste-deliciousness-of-perfectly-baked.html?m=1", }, { "startIndex": 393, "endIndex": 535, "uri": "https://skinnymixes.co.uk/blogs/food-recipes/peanut-butter-cup-cookies", }, { "startIndex": 439, "endIndex": 581, "uri": "https://mast-producing-trees.org/aldis-chocolate-chips-are-peanut-and-tree-nut-free/", }, { "startIndex": 1117, "endIndex": 1265, "uri": "https://github.com/frdrck100/To_Do_Assignments", }, { "startIndex": 1146, "endIndex": 1288, "uri": "https://skinnymixes.co.uk/blogs/food-recipes/peanut-butter-cup-cookies", }, { "startIndex": 1166, "endIndex": 1299, "uri": "https://www.girlversusdough.com/brookies/", }, { "startIndex": 1780, "endIndex": 1909, "uri": "https://chocolatecake2023.blogspot.com/2023/02/taste-deliciousness-of-perfectly-baked.html?m=1", }, { "startIndex": 1834, "endIndex": 1964, "uri": "https://newsd.in/national-cream-cheese-brownie-day-2023-date-history-how-to-make-a-cream-cheese-brownie/", }, { "startIndex": 1846, "endIndex": 1989, "uri": "https://github.com/frdrck100/To_Do_Assignments", }, { "startIndex": 2121, "endIndex": 2261, "uri": "https://recipes.net/copycat/hardee/hardees-chocolate-chip-cookie-recipe/", }, { "startIndex": 2505, "endIndex": 2671, "uri": "https://www.tfrecipes.com/Oranges%20with%20dried%20cherries/", }, { "startIndex": 3390, "endIndex": 3529, "uri": "https://github.com/quantumcognition/Crud-palm", }, { "startIndex": 3568, "endIndex": 3724, "uri": "https://recipes.net/dessert/cakes/ultimate-easy-gingerbread/", }, { "startIndex": 3640, "endIndex": 3770, "uri": "https://recipes.net/dessert/cookies/soft-and-chewy-peanut-butter-cookies/", }, ] }, } ], "usageMetadata": {"promptTokenCount": 336, "totalTokenCount": 336}, } return mock_response @pytest.mark.parametrize("provider", ["vertex_ai_beta"]) # "vertex_ai", @pytest.mark.parametrize("content_filter_type", ["prompt", "response"]) # "vertex_ai", @pytest.mark.asyncio @pytest.mark.flaky(retries=3, delay=1) async def test_gemini_pro_json_schema_httpx_content_policy_error( provider, content_filter_type ): load_vertex_ai_credentials() litellm.set_verbose = True messages = [ { "role": "user", "content": """ List 5 popular cookie recipes. 
Using this JSON schema: ```json {'$defs': {'Recipe': {'properties': {'recipe_name': {'examples': ['Chocolate Chip Cookies', 'Peanut Butter Cookies'], 'maxLength': 100, 'title': 'The recipe name', 'type': 'string'}, 'estimated_time': {'anyOf': [{'minimum': 0, 'type': 'integer'}, {'type': 'null'}], 'default': None, 'description': 'The estimated time to make the recipe in minutes', 'examples': [30, 45], 'title': 'The estimated time'}, 'ingredients': {'examples': [['flour', 'sugar', 'chocolate chips'], ['peanut butter', 'sugar', 'eggs']], 'items': {'type': 'string'}, 'maxItems': 10, 'title': 'The ingredients', 'type': 'array'}, 'instructions': {'examples': [['mix', 'bake'], ['mix', 'chill', 'bake']], 'items': {'type': 'string'}, 'maxItems': 10, 'title': 'The instructions', 'type': 'array'}}, 'required': ['recipe_name', 'ingredients', 'instructions'], 'title': 'Recipe', 'type': 'object'}}, 'properties': {'recipes': {'items': {'$ref': '#/$defs/Recipe'}, 'maxItems': 11, 'title': 'The recipes', 'type': 'array'}}, 'required': ['recipes'], 'title': 'MyRecipes', 'type': 'object'} ``` """, } ] from litellm.llms.custom_httpx.http_handler import HTTPHandler client = HTTPHandler() if content_filter_type == "prompt": _side_effect = vertex_httpx_mock_reject_prompt_post else: _side_effect = vertex_httpx_mock_post with patch.object(client, "post", side_effect=_side_effect) as mock_call: response = completion( model="vertex_ai_beta/gemini-1.5-flash", messages=messages, response_format={"type": "json_object"}, client=client, ) assert response.choices[0].finish_reason == "content_filter" mock_call.assert_called_once() def vertex_httpx_mock_post_valid_response(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "candidates": [ { "content": { "role": "model", "parts": [ { "text": """{ "recipes": [ {"recipe_name": "Chocolate Chip Cookies"}, {"recipe_name": "Oatmeal Raisin Cookies"}, {"recipe_name": "Peanut Butter Cookies"}, {"recipe_name": "Sugar Cookies"}, {"recipe_name": "Snickerdoodles"} ] }""" } ], }, "finishReason": "STOP", "safetyRatings": [ { "category": "HARM_CATEGORY_HATE_SPEECH", "probability": "NEGLIGIBLE", "probabilityScore": 0.09790669, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.11736965, }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.1261379, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.08601588, }, { "category": "HARM_CATEGORY_HARASSMENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.083441176, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.0355444, }, { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "probability": "NEGLIGIBLE", "probabilityScore": 0.071981624, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.08108212, }, ], } ], "usageMetadata": { "promptTokenCount": 60, "candidatesTokenCount": 55, "totalTokenCount": 115, }, } return mock_response def vertex_httpx_mock_post_valid_response_anthropic(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "id": "msg_vrtx_013Wki5RFQXAspL7rmxRFjZg", "type": "message", "role": "assistant", "model": "claude-3-5-sonnet-20240620", "content": [ { "type": "tool_use", "id": "toolu_vrtx_01YMnYZrToPPfcmY2myP2gEB", "name": "json_tool_call", "input": { "values": { "recipes": [ {"recipe_name": "Chocolate Chip 
Cookies"}, {"recipe_name": "Oatmeal Raisin Cookies"}, {"recipe_name": "Peanut Butter Cookies"}, {"recipe_name": "Snickerdoodle Cookies"}, {"recipe_name": "Sugar Cookies"}, ] } }, } ], "stop_reason": "tool_use", "stop_sequence": None, "usage": {"input_tokens": 368, "output_tokens": 118}, } return mock_response def vertex_httpx_mock_post_invalid_schema_response(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "candidates": [ { "content": { "role": "model", "parts": [ {"text": '[{"recipe_world": "Chocolate Chip Cookies"}]\n'} ], }, "finishReason": "STOP", "safetyRatings": [ { "category": "HARM_CATEGORY_HATE_SPEECH", "probability": "NEGLIGIBLE", "probabilityScore": 0.09790669, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.11736965, }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.1261379, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.08601588, }, { "category": "HARM_CATEGORY_HARASSMENT", "probability": "NEGLIGIBLE", "probabilityScore": 0.083441176, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.0355444, }, { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "probability": "NEGLIGIBLE", "probabilityScore": 0.071981624, "severity": "HARM_SEVERITY_NEGLIGIBLE", "severityScore": 0.08108212, }, ], } ], "usageMetadata": { "promptTokenCount": 60, "candidatesTokenCount": 55, "totalTokenCount": 115, }, } return mock_response def vertex_httpx_mock_post_invalid_schema_response_anthropic(*args, **kwargs): mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = { "id": "msg_vrtx_013Wki5RFQXAspL7rmxRFjZg", "type": "message", "role": "assistant", "model": "claude-3-5-sonnet-20240620", "content": [{"text": "Hi! 
My name is Claude.", "type": "text"}], "stop_reason": "end_turn", "stop_sequence": None, "usage": {"input_tokens": 368, "output_tokens": 118}, } return mock_response @pytest.mark.parametrize( "model, vertex_location, supports_response_schema", [ ("vertex_ai_beta/gemini-1.5-pro-001", "us-central1", True), ("gemini/gemini-1.5-pro", None, True), ("vertex_ai_beta/gemini-1.5-flash", "us-central1", True), ("vertex_ai/claude-3-5-sonnet@20240620", "us-east5", False), ], ) @pytest.mark.parametrize( "invalid_response", [True, False], ) @pytest.mark.parametrize( "enforce_validation", [True, False], ) @pytest.mark.asyncio async def test_gemini_pro_json_schema_args_sent_httpx( model, supports_response_schema, vertex_location, invalid_response, enforce_validation, ): load_vertex_ai_credentials() os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" litellm.model_cost = litellm.get_model_cost_map(url="") litellm.set_verbose = True messages = [{"role": "user", "content": "List 5 cookie recipes"}] from litellm.llms.custom_httpx.http_handler import HTTPHandler response_schema = { "type": "object", "properties": { "recipes": { "type": "array", "items": { "type": "object", "properties": {"recipe_name": {"type": "string"}}, "required": ["recipe_name"], }, } }, "required": ["recipes"], "additionalProperties": False, } client = HTTPHandler() httpx_response = MagicMock() if invalid_response is True: if "claude" in model: httpx_response.side_effect = ( vertex_httpx_mock_post_invalid_schema_response_anthropic ) else: httpx_response.side_effect = vertex_httpx_mock_post_invalid_schema_response else: if "claude" in model: httpx_response.side_effect = vertex_httpx_mock_post_valid_response_anthropic else: httpx_response.side_effect = vertex_httpx_mock_post_valid_response with patch.object(client, "post", new=httpx_response) as mock_call: print("SENDING CLIENT POST={}".format(client.post)) try: resp = completion( model=model, messages=messages, response_format={ "type": "json_object", "response_schema": response_schema, "enforce_validation": enforce_validation, }, vertex_location=vertex_location, client=client, ) print("Received={}".format(resp)) if invalid_response is True and enforce_validation is True: pytest.fail("Expected this to fail") except litellm.JSONSchemaValidationError as e: if invalid_response is False: pytest.fail("Expected this to pass. 
Got={}".format(e)) mock_call.assert_called_once() if "claude" not in model: print(mock_call.call_args.kwargs) print(mock_call.call_args.kwargs["json"]["generationConfig"]) if supports_response_schema: assert ( "response_schema" in mock_call.call_args.kwargs["json"]["generationConfig"] ) else: assert ( "response_schema" not in mock_call.call_args.kwargs["json"]["generationConfig"] ) assert ( "Use this JSON schema:" in mock_call.call_args.kwargs["json"]["contents"][0]["parts"][1][ "text" ] ) @pytest.mark.parametrize( "model, vertex_location, supports_response_schema", [ ("vertex_ai_beta/gemini-1.5-pro-001", "us-central1", True), ("gemini/gemini-1.5-pro", None, True), ("vertex_ai_beta/gemini-1.5-flash", "us-central1", True), ("vertex_ai/claude-3-5-sonnet@20240620", "us-east5", False), ], ) @pytest.mark.parametrize( "invalid_response", [True, False], ) @pytest.mark.parametrize( "enforce_validation", [True, False], ) @pytest.mark.asyncio async def test_gemini_pro_json_schema_args_sent_httpx_openai_schema( model, supports_response_schema, vertex_location, invalid_response, enforce_validation, ): from typing import List if enforce_validation: litellm.enable_json_schema_validation = True from pydantic import BaseModel load_vertex_ai_credentials() os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" litellm.model_cost = litellm.get_model_cost_map(url="") litellm.set_verbose = True messages = [{"role": "user", "content": "List 5 cookie recipes"}] from litellm.llms.custom_httpx.http_handler import HTTPHandler class Recipe(BaseModel): recipe_name: str class ResponseSchema(BaseModel): recipes: List[Recipe] client = HTTPHandler() httpx_response = MagicMock() if invalid_response is True: if "claude" in model: httpx_response.side_effect = ( vertex_httpx_mock_post_invalid_schema_response_anthropic ) else: httpx_response.side_effect = vertex_httpx_mock_post_invalid_schema_response else: if "claude" in model: httpx_response.side_effect = vertex_httpx_mock_post_valid_response_anthropic else: httpx_response.side_effect = vertex_httpx_mock_post_valid_response with patch.object(client, "post", new=httpx_response) as mock_call: print("SENDING CLIENT POST={}".format(client.post)) try: resp = completion( model=model, messages=messages, response_format=ResponseSchema, vertex_location=vertex_location, client=client, ) print("Received={}".format(resp)) if invalid_response is True and enforce_validation is True: pytest.fail("Expected this to fail") except litellm.JSONSchemaValidationError as e: if invalid_response is False: pytest.fail("Expected this to pass. 
Got={}".format(e)) mock_call.assert_called_once() if "claude" not in model: print(mock_call.call_args.kwargs) print(mock_call.call_args.kwargs["json"]["generationConfig"]) if supports_response_schema: assert ( "response_schema" in mock_call.call_args.kwargs["json"]["generationConfig"] ) assert ( "response_mime_type" in mock_call.call_args.kwargs["json"]["generationConfig"] ) assert ( mock_call.call_args.kwargs["json"]["generationConfig"][ "response_mime_type" ] == "application/json" ) else: assert ( "response_schema" not in mock_call.call_args.kwargs["json"]["generationConfig"] ) assert ( "Use this JSON schema:" in mock_call.call_args.kwargs["json"]["contents"][0]["parts"][1][ "text" ] ) @pytest.mark.parametrize( "model", ["gemini-1.5-flash", "claude-3-sonnet@20240229"] ) # "vertex_ai", @pytest.mark.asyncio async def test_gemini_pro_httpx_custom_api_base(model): load_vertex_ai_credentials() litellm.set_verbose = True messages = [ { "role": "user", "content": "Hello world", } ] from litellm.llms.custom_httpx.http_handler import HTTPHandler client = HTTPHandler() with patch.object(client, "post", new=MagicMock()) as mock_call: try: response = completion( model="vertex_ai/{}".format(model), messages=messages, response_format={"type": "json_object"}, client=client, api_base="my-custom-api-base", extra_headers={"hello": "world"}, ) except Exception as e: traceback.print_exc() print("Receives error - {}".format(str(e))) mock_call.assert_called_once() print(f"mock_call.call_args: {mock_call.call_args}") print(f"mock_call.call_args.kwargs: {mock_call.call_args.kwargs}") if "url" in mock_call.call_args.kwargs: assert ( "my-custom-api-base:generateContent" == mock_call.call_args.kwargs["url"] ) else: assert "my-custom-api-base:rawPredict" == mock_call.call_args[0][0] if "headers" in mock_call.call_args.kwargs: assert "hello" in mock_call.call_args.kwargs["headers"] # @pytest.mark.skip(reason="exhausted vertex quota. need to refactor to mock the call") @pytest.mark.parametrize("sync_mode", [True]) @pytest.mark.parametrize("provider", ["vertex_ai"]) @pytest.mark.asyncio @pytest.mark.flaky(retries=3, delay=1) async def test_gemini_pro_function_calling(provider, sync_mode): try: load_vertex_ai_credentials() litellm.set_verbose = True messages = [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, # User asks for their name and weather in San Francisco { "role": "user", "content": "Hello, what is your name and can you tell me the weather?", }, # Assistant replies with a tool call { "role": "assistant", "content": "", "tool_calls": [ { "id": "call_123", "type": "function", "index": 0, "function": { "name": "get_weather", "arguments": '{"location":"San Francisco, CA"}', }, } ], }, # The result of the tool call is added to the history { "role": "tool", "tool_call_id": "call_123", "content": "27 degrees celsius and clear in San Francisco, CA", }, # Now the assistant can reply with the result of the tool call. ] tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. 
San Francisco, CA", } }, "required": ["location"], }, }, } ] data = { "model": "{}/gemini-1.5-pro-preview-0514".format(provider), "messages": messages, "tools": tools, } if sync_mode: response = litellm.completion(**data) else: response = await litellm.acompletion(**data) print(f"response: {response}") except litellm.RateLimitError as e: pass except Exception as e: if "429 Quota exceeded" in str(e): pass else: pytest.fail("An unexpected exception occurred - {}".format(str(e))) # gemini_pro_function_calling() @pytest.mark.parametrize("sync_mode", [True]) @pytest.mark.asyncio @pytest.mark.flaky(retries=3, delay=1) async def test_gemini_pro_function_calling_streaming(sync_mode): load_vertex_ai_credentials() litellm.set_verbose = True data = { "model": "vertex_ai/gemini-pro", "messages": [ { "role": "user", "content": "Call the submit_cities function with San Francisco and New York", } ], "tools": [ { "type": "function", "function": { "name": "submit_cities", "description": "Submits a list of cities", "parameters": { "type": "object", "properties": { "cities": {"type": "array", "items": {"type": "string"}} }, "required": ["cities"], }, }, } ], "tool_choice": "auto", "n": 1, "stream": True, "temperature": 0.1, } chunks = [] try: if sync_mode == True: response = litellm.completion(**data) print(f"completion: {response}") for chunk in response: chunks.append(chunk) assert isinstance(chunk, litellm.ModelResponse) else: response = await litellm.acompletion(**data) print(f"completion: {response}") assert isinstance(response, litellm.CustomStreamWrapper) async for chunk in response: print(f"chunk: {chunk}") chunks.append(chunk) assert isinstance(chunk, litellm.ModelResponse) complete_response = litellm.stream_chunk_builder(chunks=chunks) assert ( complete_response.choices[0].message.content is not None or len(complete_response.choices[0].message.tool_calls) > 0 ) print(f"complete_response: {complete_response}") except litellm.APIError as e: pass except litellm.RateLimitError as e: pass @pytest.mark.asyncio @pytest.mark.flaky(retries=3, delay=1) async def test_gemini_pro_async_function_calling(): load_vertex_ai_credentials() litellm.set_verbose = True try: tools = [ { "type": "function", "function": { "name": "get_current_weather", "description": "Get the current weather in a given location.", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. 
San Francisco, CA", }, "unit": { "type": "string", "enum": ["celsius", "fahrenheit"], }, }, "required": ["location"], }, }, } ] messages = [ { "role": "user", "content": "What's the weather like in Boston today in fahrenheit?", } ] completion = await litellm.acompletion( model="gemini-pro", messages=messages, tools=tools, tool_choice="auto" ) print(f"completion: {completion}") print(f"message content: {completion.choices[0].message.content}") assert completion.choices[0].message.content is None assert len(completion.choices[0].message.tool_calls) == 1 # except litellm.APIError as e: # pass except litellm.RateLimitError as e: pass except Exception as e: pytest.fail(f"An exception occurred - {str(e)}") # raise Exception("it worked!") # asyncio.run(gemini_pro_async_function_calling()) @pytest.mark.flaky(retries=3, delay=1) @pytest.mark.parametrize("sync_mode", [True, False]) @pytest.mark.asyncio async def test_vertexai_embedding(sync_mode): try: load_vertex_ai_credentials() litellm.set_verbose = True input_text = ["good morning from litellm", "this is another item"] if sync_mode: response = litellm.embedding( model="textembedding-gecko@001", input=input_text ) else: response = await litellm.aembedding( model="textembedding-gecko@001", input=input_text ) print(f"response: {response}") # Assert that the response is not None assert response is not None # Assert that the response contains embeddings assert hasattr(response, "data") assert len(response.data) == len(input_text) # Assert that each embedding is a non-empty list of floats for embedding in response.data: assert "embedding" in embedding assert isinstance(embedding["embedding"], list) assert len(embedding["embedding"]) > 0 assert all(isinstance(x, float) for x in embedding["embedding"]) except litellm.RateLimitError as e: pass except Exception as e: pytest.fail(f"Error occurred: {e}") @pytest.mark.asyncio async def test_vertexai_multimodal_embedding(): load_vertex_ai_credentials() mock_response = AsyncMock() def return_val(): return { "predictions": [ { "imageEmbedding": [0.1, 0.2, 0.3], # Simplified example "textEmbedding": [0.4, 0.5, 0.6], # Simplified example } ] } mock_response.json = return_val mock_response.status_code = 200 expected_payload = { "instances": [ { "image": { "gcsUri": "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png" }, "text": "this is a unicorn", } ] } with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", return_value=mock_response, ) as mock_post: # Act: Call the litellm.aembedding function response = await litellm.aembedding( model="vertex_ai/multimodalembedding@001", input=[ { "image": { "gcsUri": "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png" }, "text": "this is a unicorn", }, ], ) # Assert mock_post.assert_called_once() _, kwargs = mock_post.call_args args_to_vertexai = kwargs["json"] print("args to vertex ai call:", args_to_vertexai) assert args_to_vertexai == expected_payload assert response.model == "multimodalembedding@001" assert len(response.data) == 1 response_data = response.data[0] # Optional: Print for debugging print("Arguments passed to Vertex AI:", args_to_vertexai) print("Response:", response) @pytest.mark.asyncio async def test_vertexai_multimodal_embedding_text_input(): load_vertex_ai_credentials() mock_response = AsyncMock() def return_val(): return { "predictions": [ { "textEmbedding": [0.4, 0.5, 0.6], # Simplified example } ] } mock_response.json = return_val mock_response.status_code = 200 expected_payload = { "instances": [ { "text": "this 
is a unicorn", } ] } with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", return_value=mock_response, ) as mock_post: # Act: Call the litellm.aembedding function response = await litellm.aembedding( model="vertex_ai/multimodalembedding@001", input=[ "this is a unicorn", ], ) # Assert mock_post.assert_called_once() _, kwargs = mock_post.call_args args_to_vertexai = kwargs["json"] print("args to vertex ai call:", args_to_vertexai) assert args_to_vertexai == expected_payload assert response.model == "multimodalembedding@001" assert len(response.data) == 1 response_data = response.data[0] assert response_data["embedding"] == [0.4, 0.5, 0.6] # Optional: Print for debugging print("Arguments passed to Vertex AI:", args_to_vertexai) print("Response:", response) @pytest.mark.asyncio async def test_vertexai_multimodal_embedding_image_in_input(): load_vertex_ai_credentials() mock_response = AsyncMock() def return_val(): return { "predictions": [ { "imageEmbedding": [0.1, 0.2, 0.3], # Simplified example } ] } mock_response.json = return_val mock_response.status_code = 200 expected_payload = { "instances": [ { "image": { "gcsUri": "gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png" }, } ] } with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", return_value=mock_response, ) as mock_post: # Act: Call the litellm.aembedding function response = await litellm.aembedding( model="vertex_ai/multimodalembedding@001", input=["gs://cloud-samples-data/vertex-ai/llm/prompts/landmark1.png"], ) # Assert mock_post.assert_called_once() _, kwargs = mock_post.call_args args_to_vertexai = kwargs["json"] print("args to vertex ai call:", args_to_vertexai) assert args_to_vertexai == expected_payload assert response.model == "multimodalembedding@001" assert len(response.data) == 1 response_data = response.data[0] assert response_data["embedding"] == [0.1, 0.2, 0.3] # Optional: Print for debugging print("Arguments passed to Vertex AI:", args_to_vertexai) print("Response:", response) @pytest.mark.asyncio async def test_vertexai_multimodal_embedding_base64image_in_input(): import base64 import requests load_vertex_ai_credentials() mock_response = AsyncMock() url = "https://dummyimage.com/100/100/fff&text=Test+image" response = requests.get(url) file_data = response.content encoded_file = base64.b64encode(file_data).decode("utf-8") base64_image = f"data:image/png;base64,{encoded_file}" def return_val(): return { "predictions": [ { "imageEmbedding": [0.1, 0.2, 0.3], # Simplified example } ] } mock_response.json = return_val mock_response.status_code = 200 expected_payload = { "instances": [ { "image": {"bytesBase64Encoded": base64_image}, } ] } with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", return_value=mock_response, ) as mock_post: # Act: Call the litellm.aembedding function response = await litellm.aembedding( model="vertex_ai/multimodalembedding@001", input=[base64_image], ) # Assert mock_post.assert_called_once() _, kwargs = mock_post.call_args args_to_vertexai = kwargs["json"] print("args to vertex ai call:", args_to_vertexai) assert args_to_vertexai == expected_payload assert response.model == "multimodalembedding@001" assert len(response.data) == 1 response_data = response.data[0] assert response_data["embedding"] == [0.1, 0.2, 0.3] # Optional: Print for debugging print("Arguments passed to Vertex AI:", args_to_vertexai) print("Response:", response) @pytest.mark.skip( reason="new test - works locally running into vertex version issues on 
ci/cd"
)
def test_vertexai_embedding_embedding_latest():
    try:
        load_vertex_ai_credentials()
        litellm.set_verbose = True

        response = embedding(
            model="vertex_ai/text-embedding-004",
            input=["hi"],
            dimensions=1,
            auto_truncate=True,
            task_type="RETRIEVAL_QUERY",
        )

        assert len(response.data[0]["embedding"]) == 1
        assert response.usage.prompt_tokens > 0
        print(f"response: {response}")
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


@pytest.mark.flaky(retries=3, delay=1)
def test_vertexai_embedding_embedding_latest_input_type():
    try:
        load_vertex_ai_credentials()
        litellm.set_verbose = True

        response = embedding(
            model="vertex_ai/text-embedding-004",
            input=["hi"],
            input_type="RETRIEVAL_QUERY",
        )
        assert response.usage.prompt_tokens > 0
        print(f"response: {response}")
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_vertexai_aembedding():
    try:
        load_vertex_ai_credentials()
        # litellm.set_verbose=True
        response = await litellm.aembedding(
            model="textembedding-gecko@001",
            input=["good morning from litellm", "this is another item"],
        )
        print(f"response: {response}")
    except litellm.RateLimitError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


def test_tool_name_conversion():
    messages = [
        {
            "role": "system",
            "content": "Your name is Litellm Bot, you are a helpful assistant",
        },
        # User asks for their name and weather in San Francisco
        {
            "role": "user",
            "content": "Hello, what is your name and can you tell me the weather?",
        },
        # Assistant replies with a tool call
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_123",
                    "type": "function",
                    "index": 0,
                    "function": {
                        "name": "get_weather",
                        "arguments": '{"location":"San Francisco, CA"}',
                    },
                }
            ],
        },
        # The result of the tool call is added to the history
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "27 degrees celsius and clear in San Francisco, CA",
        },
        # Now the assistant can reply with the result of the tool call.
    ]

    translated_messages = _gemini_convert_messages_with_history(messages=messages)

    print(f"\n\ntranslated_messages: {translated_messages}\n")

    # assert that the last tool response has the corresponding tool name
    assert (
        translated_messages[-1]["parts"][0]["function_response"]["name"]
        == "get_weather"
    )


def test_prompt_factory():
    messages = [
        {
            "role": "system",
            "content": "Your name is Litellm Bot, you are a helpful assistant",
        },
        # User asks for their name and weather in San Francisco
        {
            "role": "user",
            "content": "Hello, what is your name and can you tell me the weather?",
        },
        # Assistant replies with a tool call
        {
            "role": "assistant",
            "content": "",
            "tool_calls": [
                {
                    "id": "call_123",
                    "type": "function",
                    "index": 0,
                    "function": {
                        "name": "get_weather",
                        "arguments": '{"location":"San Francisco, CA"}',
                    },
                }
            ],
        },
        # The result of the tool call is added to the history
        {
            "role": "tool",
            "tool_call_id": "call_123",
            "content": "27 degrees celsius and clear in San Francisco, CA",
        },
        # Now the assistant can reply with the result of the tool call.
    ]

    translated_messages = _gemini_convert_messages_with_history(messages=messages)

    print(f"\n\ntranslated_messages: {translated_messages}\n")


def test_prompt_factory_nested():
    messages = [
        {"role": "user", "content": [{"type": "text", "text": "hi"}]},
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "Hi! 👋 \n\nHow can I help you today? 😊 \n"}
            ],
        },
        {"role": "user", "content": [{"type": "text", "text": "hi 2nd time"}]},
    ]

    translated_messages = _gemini_convert_messages_with_history(messages=messages)

    print(f"\n\ntranslated_messages: {translated_messages}\n")

    for message in translated_messages:
        assert len(message["parts"]) == 1
        assert "text" in message["parts"][0], "Missing 'text' from 'parts'"
        assert isinstance(
            message["parts"][0]["text"], str
        ), "'text' value not a string."


def test_get_token_url():
    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
        VertexLLM,
    )

    vertex_llm = VertexLLM()
    vertex_ai_project = "adroit-crow-413218"
    vertex_ai_location = "us-central1"
    json_obj = get_vertex_ai_creds_json()
    vertex_credentials = json.dumps(json_obj)

    # cached_content should trigger use of the v1beta1 endpoint
    should_use_v1beta1_features = vertex_llm.is_using_v1beta1_features(
        optional_params={"cached_content": "hi"}
    )

    assert should_use_v1beta1_features is True

    _, url = vertex_llm._get_token_and_url(
        auth_header=None,
        vertex_project=vertex_ai_project,
        vertex_location=vertex_ai_location,
        vertex_credentials=vertex_credentials,
        gemini_api_key="",
        custom_llm_provider="vertex_ai_beta",
        should_use_v1beta1_features=should_use_v1beta1_features,
        api_base=None,
        model="",
        stream=False,
    )

    print("url=", url)

    assert "/v1beta1/" in url

    # plain generation params should route to the stable v1 endpoint
    should_use_v1beta1_features = vertex_llm.is_using_v1beta1_features(
        optional_params={"temperature": 0.1}
    )

    _, url = vertex_llm._get_token_and_url(
        auth_header=None,
        vertex_project=vertex_ai_project,
        vertex_location=vertex_ai_location,
        vertex_credentials=vertex_credentials,
        gemini_api_key="",
        custom_llm_provider="vertex_ai_beta",
        should_use_v1beta1_features=should_use_v1beta1_features,
        api_base=None,
        model="",
        stream=False,
    )

    print("url for normal request", url)

    assert "v1beta1" not in url
    assert "/v1/" in url


@pytest.mark.asyncio
async def test_completion_fine_tuned_model():
    # load_vertex_ai_credentials()
    mock_response = AsyncMock()

    def return_val():
        return {
            "candidates": [
                {
                    "content": {
                        "role": "model",
                        "parts": [
                            {
                                "text": "A canvas vast, a boundless blue,\nWhere clouds paint tales and winds imbue.\nThe sun descends in fiery hue,\nStars shimmer bright, a gentle few.\n\nThe moon ascends, a pearl of light,\nGuiding travelers through the night.\nThe sky embraces, holds all tight,\nA tapestry of wonder, bright."
                            }
                        ],
                    },
                    "finishReason": "STOP",
                    "safetyRatings": [
                        {
                            "category": "HARM_CATEGORY_HATE_SPEECH",
                            "probability": "NEGLIGIBLE",
                            "probabilityScore": 0.028930664,
                            "severity": "HARM_SEVERITY_NEGLIGIBLE",
                            "severityScore": 0.041992188,
                        },
                        # ... other safety ratings ...
], "avgLogprobs": -0.95772853367765187, } ], "usageMetadata": { "promptTokenCount": 7, "candidatesTokenCount": 71, "totalTokenCount": 78, }, } mock_response.json = return_val mock_response.status_code = 200 expected_payload = { "contents": [ {"role": "user", "parts": [{"text": "Write a short poem about the sky"}]} ], "generationConfig": {}, } with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", return_value=mock_response, ) as mock_post: # Act: Call the litellm.completion function response = await litellm.acompletion( model="vertex_ai_beta/4965075652664360960", messages=[{"role": "user", "content": "Write a short poem about the sky"}], ) # Assert mock_post.assert_called_once() url, kwargs = mock_post.call_args print("url = ", url) # this is the fine-tuned model endpoint assert ( url[0] == "https://us-central1-aiplatform.googleapis.com/v1/projects/adroit-crow-413218/locations/us-central1/endpoints/4965075652664360960:generateContent" ) print("call args = ", kwargs) args_to_vertexai = kwargs["json"] print("args to vertex ai call:", args_to_vertexai) assert args_to_vertexai == expected_payload assert response.choices[0].message.content.startswith("A canvas vast") assert response.choices[0].finish_reason == "stop" assert response.usage.total_tokens == 78 # Optional: Print for debugging print("Arguments passed to Vertex AI:", args_to_vertexai) print("Response:", response) def mock_gemini_request(*args, **kwargs): print(f"kwargs: {kwargs}") mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} if "cachedContents" in kwargs["url"]: mock_response.json.return_value = { "name": "cachedContents/4d2kd477o3pg", "model": "models/gemini-1.5-flash-001", "createTime": "2024-08-26T22:31:16.147190Z", "updateTime": "2024-08-26T22:31:16.147190Z", "expireTime": "2024-08-26T22:36:15.548934784Z", "displayName": "", "usageMetadata": {"totalTokenCount": 323383}, } else: mock_response.json.return_value = { "candidates": [ { "content": { "parts": [ { "text": "Please provide me with the text of the legal agreement" } ], "role": "model", }, "finishReason": "MAX_TOKENS", "index": 0, "safetyRatings": [ { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "probability": "NEGLIGIBLE", }, { "category": "HARM_CATEGORY_HATE_SPEECH", "probability": "NEGLIGIBLE", }, { "category": "HARM_CATEGORY_HARASSMENT", "probability": "NEGLIGIBLE", }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "probability": "NEGLIGIBLE", }, ], } ], "usageMetadata": { "promptTokenCount": 40049, "candidatesTokenCount": 10, "totalTokenCount": 40059, "cachedContentTokenCount": 40012, }, } return mock_response def mock_gemini_list_request(*args, **kwargs): from litellm.types.llms.vertex_ai import ( CachedContent, CachedContentListAllResponseBody, ) print(f"kwargs: {kwargs}") mock_response = MagicMock() mock_response.status_code = 200 mock_response.headers = {"Content-Type": "application/json"} mock_response.json.return_value = CachedContentListAllResponseBody( cachedContents=[CachedContent(name="test", displayName="test")] ) return mock_response import uuid @pytest.mark.parametrize( "sync_mode", [True, False], ) @pytest.mark.asyncio async def test_gemini_context_caching_anthropic_format(sync_mode): from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler litellm.set_verbose = True gemini_context_caching_messages = [ # System Message { "role": "system", "content": [ { "type": "text", "text": "Here is the full text of a complex legal agreement 
{}".format( uuid.uuid4() ) * 4000, "cache_control": {"type": "ephemeral"}, } ], }, # marked for caching with the cache_control parameter, so that this checkpoint can read from the previous cache. { "role": "user", "content": [ { "type": "text", "text": "What are the key terms and conditions in this agreement?", "cache_control": {"type": "ephemeral"}, } ], }, { "role": "assistant", "content": "Certainly! the key terms and conditions are the following: the contract is 1 year long for $10/mo", }, # The final turn is marked with cache-control, for continuing in followups. { "role": "user", "content": [ { "type": "text", "text": "What are the key terms and conditions in this agreement?", } ], }, ] if sync_mode: client = HTTPHandler(concurrent_limit=1) else: client = AsyncHTTPHandler(concurrent_limit=1) with patch.object(client, "post", side_effect=mock_gemini_request) as mock_client: try: if sync_mode: response = litellm.completion( model="gemini/gemini-1.5-flash-001", messages=gemini_context_caching_messages, temperature=0.2, max_tokens=10, client=client, ) else: response = await litellm.acompletion( model="gemini/gemini-1.5-flash-001", messages=gemini_context_caching_messages, temperature=0.2, max_tokens=10, client=client, ) except Exception as e: print(e) assert mock_client.call_count == 2 first_call_args = mock_client.call_args_list[0].kwargs print(f"first_call_args: {first_call_args}") assert "cachedContents" in first_call_args["url"] # assert "cache_read_input_tokens" in response.usage # assert "cache_creation_input_tokens" in response.usage # # Assert either a cache entry was created or cache was read - changes depending on the anthropic api ttl # assert (response.usage.cache_read_input_tokens > 0) or ( # response.usage.cache_creation_input_tokens > 0 # ) @pytest.mark.asyncio async def test_partner_models_httpx_ai21(): litellm.set_verbose = True model = "vertex_ai/jamba-1.5-mini@001" messages = [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, { "role": "user", "content": "Hello, can you tell me the weather in San Francisco?", }, ] tools = [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. 
San Francisco, CA", } }, "required": ["location"], }, }, } ] data = { "model": model, "messages": messages, "tools": tools, "top_p": 0.5, } mock_response = AsyncMock() def return_val(): return { "id": "chat-3d11cf95eb224966937b216d9494fe73", "choices": [ { "index": 0, "message": { "role": "assistant", "content": " Sure, let me check that for you.", "tool_calls": [ { "id": "b5cef16b-5946-4937-b9d5-beeaea871e77", "type": "function", "function": { "name": "get_weather", "arguments": '{"location": "San Francisco"}', }, } ], }, "finish_reason": "stop", } ], "usage": { "prompt_tokens": 158, "completion_tokens": 36, "total_tokens": 194, }, "meta": {"requestDurationMillis": 501}, "model": "jamba-1.5", } mock_response.json = return_val mock_response.status_code = 200 with patch( "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post", return_value=mock_response, ) as mock_post: response = await litellm.acompletion(**data) # Assert mock_post.assert_called_once() url, kwargs = mock_post.call_args print("url = ", url) print("call args = ", kwargs) print(kwargs["data"]) assert ( url[0] == "https://us-central1-aiplatform.googleapis.com/v1beta1/projects/adroit-crow-413218/locations/us-central1/publishers/ai21/models/jamba-1.5-mini@001:rawPredict" ) # json loads kwargs kwargs["data"] = json.loads(kwargs["data"]) assert kwargs["data"] == { "model": "jamba-1.5-mini", "messages": [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, { "role": "user", "content": "Hello, can you tell me the weather in San Francisco?", }, ], "top_p": 0.5, "tools": [ { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather in a given location", "parameters": { "type": "object", "properties": { "location": { "type": "string", "description": "The city and state, e.g. San Francisco, CA", } }, "required": ["location"], }, }, } ], "stream": False, } assert response.id == "chat-3d11cf95eb224966937b216d9494fe73" assert len(response.choices) == 1 assert ( response.choices[0].message.content == " Sure, let me check that for you." 
) assert response.choices[0].message.tool_calls[0].function.name == "get_weather" assert ( response.choices[0].message.tool_calls[0].function.arguments == '{"location": "San Francisco"}' ) assert response.usage.prompt_tokens == 158 assert response.usage.completion_tokens == 36 assert response.usage.total_tokens == 194 print(f"response: {response}") def test_gemini_function_call_parameter_in_messages(): litellm.set_verbose = True load_vertex_ai_credentials() from litellm.llms.custom_httpx.http_handler import HTTPHandler tools = [ { "type": "function", "function": { "name": "search", "description": "Executes searches.", "parameters": { "type": "object", "properties": { "queries": { "type": "array", "description": "A list of queries to search for.", "items": {"type": "string"}, }, }, "required": ["queries"], }, }, }, ] # Set up the messages messages = [ {"role": "system", "content": """Use search for most queries."""}, {"role": "user", "content": """search for weather in boston (use `search`)"""}, { "role": "assistant", "content": None, "function_call": { "name": "search", "arguments": '{"queries": ["weather in boston"]}', }, }, { "role": "function", "name": "search", "content": "The current weather in Boston is 22°F.", }, ] client = HTTPHandler(concurrent_limit=1) with patch.object(client, "post", new=MagicMock()) as mock_client: try: response_stream = completion( model="vertex_ai/gemini-1.5-pro", messages=messages, tools=tools, tool_choice="auto", client=client, ) except Exception as e: print(e) # mock_client.assert_any_call() assert { "contents": [ { "role": "user", "parts": [{"text": "search for weather in boston (use `search`)"}], }, { "role": "model", "parts": [ { "function_call": { "name": "search", "args": {"queries": ["weather in boston"]}, } } ], }, { "parts": [ { "function_response": { "name": "search", "response": { "content": "The current weather in Boston is 22°F." }, } } ] }, ], "system_instruction": {"parts": [{"text": "Use search for most queries."}]}, "tools": [ { "function_declarations": [ { "name": "search", "description": "Executes searches.", "parameters": { "type": "object", "properties": { "queries": { "type": "array", "description": "A list of queries to search for.", "items": {"type": "string"}, } }, "required": ["queries"], }, } ] } ], "toolConfig": {"functionCallingConfig": {"mode": "AUTO"}}, "generationConfig": {}, } == mock_client.call_args.kwargs["json"] def test_gemini_function_call_parameter_in_messages_2(): litellm.set_verbose = True from litellm.llms.vertex_ai_and_google_ai_studio.gemini.transformation import ( _gemini_convert_messages_with_history, ) messages = [ {"role": "user", "content": "search for weather in boston (use `search`)"}, { "role": "assistant", "content": "Sure, let me check.", "function_call": { "name": "search", "arguments": '{"queries": ["weather in boston"]}', }, }, { "role": "function", "name": "search", "content": "The weather in Boston is 100 degrees.", }, ] returned_contents = _gemini_convert_messages_with_history(messages=messages) print(f"returned_contents: {returned_contents}") assert returned_contents == [ { "role": "user", "parts": [{"text": "search for weather in boston (use `search`)"}], }, { "role": "model", "parts": [ {"text": "Sure, let me check."}, { "function_call": { "name": "search", "args": {"queries": ["weather in boston"]}, } }, ], }, { "parts": [ { "function_response": { "name": "search", "response": { "content": "The weather in Boston is 100 degrees." 
}, } } ] }, ] @pytest.mark.parametrize( "base_model, metadata", [ (None, {"model_info": {"base_model": "vertex_ai/gemini-1.5-pro"}}), ("vertex_ai/gemini-1.5-pro", None), ], ) def test_gemini_finetuned_endpoint(base_model, metadata): litellm.set_verbose = True load_vertex_ai_credentials() from litellm.llms.custom_httpx.http_handler import HTTPHandler # Set up the messages messages = [ {"role": "system", "content": """Use search for most queries."""}, {"role": "user", "content": """search for weather in boston (use `search`)"""}, ] client = HTTPHandler(concurrent_limit=1) with patch.object(client, "post", new=MagicMock()) as mock_client: try: response = completion( model="vertex_ai/4965075652664360960", messages=messages, tool_choice="auto", client=client, metadata=metadata, base_model=base_model, ) except Exception as e: print(e) print(mock_client.call_args.kwargs) mock_client.assert_called() assert mock_client.call_args.kwargs["url"].endswith( "endpoints/4965075652664360960:generateContent" ) @pytest.mark.parametrize("api_base", ["", None, "my-custom-proxy-base"]) def test_custom_api_base(api_base): stream = None test_endpoint = "my-fake-endpoint" vertex_base = VertexBase() auth_header, url = vertex_base._check_custom_proxy( api_base=api_base, custom_llm_provider="gemini", gemini_api_key="12324", endpoint="", stream=stream, auth_header=None, url="my-fake-endpoint", ) if api_base: assert url == api_base + ":" else: assert url == test_endpoint @pytest.mark.asyncio @pytest.mark.respx async def test_vertexai_embedding_finetuned(respx_mock: MockRouter): """ Tests that: - Request URL and body are correctly formatted for Vertex AI embeddings - Response is properly parsed into litellm's embedding response format """ load_vertex_ai_credentials() litellm.set_verbose = True # Test input input_text = ["good morning from litellm", "this is another item"] # Expected request/response expected_url = "https://us-central1-aiplatform.googleapis.com/v1/projects/633608382793/locations/us-central1/endpoints/1004708436694269952:predict" expected_request = { "instances": [ {"inputs": "good morning from litellm"}, {"inputs": "this is another item"}, ], "parameters": {}, } mock_response = { "predictions": [ [[-0.000431762, -0.04416759, -0.03443353]], # Truncated embedding vector [[-0.000431762, -0.04416759, -0.03443353]], # Truncated embedding vector ], "deployedModelId": "2275167734310371328", "model": "projects/633608382793/locations/us-central1/models/snowflake-arctic-embed-m-long-1731622468876", "modelDisplayName": "snowflake-arctic-embed-m-long-1731622468876", "modelVersionId": "1", } # Setup mock request mock_request = respx_mock.post(expected_url).mock( return_value=httpx.Response(200, json=mock_response) ) # Make request response = await litellm.aembedding( vertex_project="633608382793", model="vertex_ai/1004708436694269952", input=input_text, ) # Assert request was made correctly assert mock_request.called request_body = json.loads(mock_request.calls[0].request.content) print("\n\nrequest_body", request_body) print("\n\nexpected_request", expected_request) assert request_body == expected_request # Assert response structure assert response is not None assert hasattr(response, "data") assert len(response.data) == len(input_text) # Assert embedding structure for embedding in response.data: assert "embedding" in embedding assert isinstance(embedding["embedding"], list) assert len(embedding["embedding"]) > 0 assert all(isinstance(x, float) for x in embedding["embedding"]) @pytest.mark.parametrize("max_retries", 
[None, 3]) @pytest.mark.asyncio @pytest.mark.respx async def test_vertexai_model_garden_model_completion( respx_mock: MockRouter, max_retries ): """ Relevant issue: https://github.com/BerriAI/litellm/issues/6480 Using OpenAI compatible models from Vertex Model Garden """ load_vertex_ai_credentials() litellm.set_verbose = True # Test input messages = [ { "role": "system", "content": "Your name is Litellm Bot, you are a helpful assistant", }, { "role": "user", "content": "Hello, what is your name and can you tell me the weather?", }, ] # Expected request/response expected_url = "https://us-central1-aiplatform.googleapis.com/v1beta1/projects/633608382793/locations/us-central1/endpoints/5464397967697903616/chat/completions" expected_request = {"model": "", "messages": messages, "stream": False} mock_response = { "id": "chat-09940d4e99e3488aa52a6f5e2ecf35b1", "object": "chat.completion", "created": 1731702782, "model": "meta-llama/Llama-3.1-8B-Instruct", "choices": [ { "index": 0, "message": { "role": "assistant", "content": "Hello, my name is Litellm Bot. I'm a helpful assistant here to provide information and answer your questions.\n\nTo check the weather for you, I'll need to know your location. Could you please provide me with your city or zip code? That way, I can give you the most accurate and up-to-date weather information.\n\nIf you don't have your location handy, I can also suggest some popular weather websites or apps that you can use to check the weather for your area.\n\nLet me know how I can assist you!", "tool_calls": [], }, "logprobs": None, "finish_reason": "stop", "stop_reason": None, } ], "usage": {"prompt_tokens": 63, "total_tokens": 172, "completion_tokens": 109}, "prompt_logprobs": None, } # Setup mock request mock_request = respx_mock.post(expected_url).mock( return_value=httpx.Response(200, json=mock_response) ) # Make request response = await litellm.acompletion( model="vertex_ai/openai/5464397967697903616", messages=messages, vertex_project="633608382793", vertex_location="us-central1", max_retries=max_retries, ) # Assert request was made correctly assert mock_request.called request_body = json.loads(mock_request.calls[0].request.content) assert request_body == expected_request # Assert response structure assert response.id == "chat-09940d4e99e3488aa52a6f5e2ecf35b1" assert response.created == 1731702782 assert response.model == "vertex_ai/meta-llama/Llama-3.1-8B-Instruct" assert len(response.choices) == 1 assert response.choices[0].message.role == "assistant" assert response.choices[0].message.content.startswith( "Hello, my name is Litellm Bot" ) assert response.choices[0].finish_reason == "stop" assert response.usage.completion_tokens == 109 assert response.usage.prompt_tokens == 63 assert response.usage.total_tokens == 172