diff --git a/dist/litellm-1.14.0.dev1-py3-none-any.whl b/dist/litellm-1.14.0.dev1-py3-none-any.whl
new file mode 100644
index 000000000..7428d252d
Binary files /dev/null and b/dist/litellm-1.14.0.dev1-py3-none-any.whl differ
diff --git a/dist/litellm-1.14.0.dev1.tar.gz b/dist/litellm-1.14.0.dev1.tar.gz
new file mode 100644
index 000000000..cd80b0e71
Binary files /dev/null and b/dist/litellm-1.14.0.dev1.tar.gz differ
diff --git a/litellm/llms/vertex_ai.py b/litellm/llms/vertex_ai.py
index 3d814f22e..b7bc7935a 100644
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@@ -69,6 +69,7 @@ def completion(
     optional_params=None,
     litellm_params=None,
     logger_fn=None,
+    acompletion: bool=False
 ):
     try:
         import vertexai
@@ -77,7 +78,7 @@ def completion(
     try:
         from vertexai.preview.language_models import ChatModel, CodeChatModel, InputOutputTextPair
         from vertexai.language_models import TextGenerationModel, CodeGenerationModel
-        from vertexai.preview.generative_models import GenerativeModel, Part
+        from vertexai.preview.generative_models import GenerativeModel, Part, GenerationConfig
 
 
         vertexai.init(
@@ -99,13 +100,13 @@ def completion(
         request_str = ""
         response_obj = None
         if model in litellm.vertex_language_models:
-            chat_model = GenerativeModel(model)
+            llm_model = GenerativeModel(model)
             mode = ""
-            request_str += f"chat_model = GenerativeModel({model})\n"
+            request_str += f"llm_model = GenerativeModel({model})\n"
         elif model in litellm.vertex_chat_models:
-            chat_model = ChatModel.from_pretrained(model)
+            llm_model = ChatModel.from_pretrained(model)
             mode = "chat"
-            request_str += f"chat_model = ChatModel.from_pretrained({model})\n"
+            request_str += f"llm_model = ChatModel.from_pretrained({model})\n"
         elif model in litellm.vertex_text_models:
             text_model = TextGenerationModel.from_pretrained(model)
             mode = "text"
@@ -114,34 +115,38 @@ def completion(
             text_model = CodeGenerationModel.from_pretrained(model)
             mode = "text"
             request_str += f"text_model = CodeGenerationModel.from_pretrained({model})\n"
         else: # vertex_code_chat_models
-            chat_model = CodeChatModel.from_pretrained(model)
+            llm_model = CodeChatModel.from_pretrained(model)
             mode = "chat"
-            request_str += f"chat_model = CodeChatModel.from_pretrained({model})\n"
+            request_str += f"llm_model = CodeChatModel.from_pretrained({model})\n"
 
+        if acompletion == True and model in litellm.vertex_language_models: # [TODO] expand support to vertex ai chat + text models
+            if optional_params.get("stream", False) is True:
+                # async streaming
+                pass
+            return async_completion(llm_model=llm_model, mode=mode, prompt=prompt, logging_obj=logging_obj, request_str=request_str, model=model, model_response=model_response, **optional_params)
+
         if mode == "":
-            chat = chat_model.start_chat()
-            request_str+= f"chat = chat_model.start_chat()\n"
+            chat = llm_model.start_chat()
+            request_str+= f"chat = llm_model.start_chat()\n"
 
             if "stream" in optional_params and optional_params["stream"] == True:
                 request_str += f"chat.send_message_streaming({prompt}, **{optional_params})\n"
                 ## LOGGING
                 logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
-                model_response = chat.send_message(prompt, **optional_params)
+                model_response = chat.send_message(prompt, generation_config=GenerationConfig(**optional_params))
                 optional_params["stream"] = True
                 return model_response
             request_str += f"chat.send_message({prompt}, **{optional_params}).text\n"
+            ## LOGGING
             logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
-            response_obj = chat.send_message(prompt, **optional_params)
+            response_obj = chat.send_message(prompt, generation_config=GenerationConfig(**optional_params))
             completion_response = response_obj.text
             response_obj = response_obj._raw_response
         elif mode == "chat":
-            chat = chat_model.start_chat()
-            request_str+= f"chat = chat_model.start_chat()\n"
-
-            ## LOGGING
-
+            chat = llm_model.start_chat()
+            request_str+= f"chat = llm_model.start_chat()\n"
 
             if "stream" in optional_params and optional_params["stream"] == True:
                 # NOTE: VertexAI does not accept stream=True as a param and raises an error,
@@ -149,12 +154,14 @@ def completion(
             # after we get the response we add optional_params["stream"] = True, since main.py needs to know it's a streaming response to then transform it for the OpenAI format
                 optional_params.pop("stream", None) # vertex ai raises an error when passing stream in optional params
                 request_str += f"chat.send_message_streaming({prompt}, **{optional_params})\n"
+                ## LOGGING
                 logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
                 model_response = chat.send_message_streaming(prompt, **optional_params)
                 optional_params["stream"] = True
                 return model_response
             request_str += f"chat.send_message({prompt}, **{optional_params}).text\n"
+            ## LOGGING
             logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
             completion_response = chat.send_message(prompt, **optional_params).text
         elif mode == "text":
@@ -162,12 +169,14 @@ def completion(
             if "stream" in optional_params and optional_params["stream"] == True:
                 optional_params.pop("stream", None) # See note above on handling streaming for vertex ai
                 request_str += f"text_model.predict_streaming({prompt}, **{optional_params})\n"
+                ## LOGGING
                 logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
                 model_response = text_model.predict_streaming(prompt, **optional_params)
                 optional_params["stream"] = True
                 return model_response
             request_str += f"text_model.predict({prompt}, **{optional_params}).text\n"
+            ## LOGGING
             logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
             completion_response = text_model.predict(prompt, **optional_params).text
@@ -207,6 +216,48 @@ def completion(
     except Exception as e:
         raise VertexAIError(status_code=500, message=str(e))
 
+async def async_completion(llm_model, mode: str, prompt: str, model: str, model_response: ModelResponse, logging_obj=None, request_str=None, **optional_params):
+    """
+    Add support for acompletion calls for gemini-pro
+    """
+    from vertexai.preview.generative_models import GenerationConfig
+
+    if mode == "":
+        # gemini-pro
+        llm_model = llm_model.start_chat()
+        ## LOGGING
+        logging_obj.pre_call(input=prompt, api_key=None, additional_args={"complete_input_dict": optional_params, "request_str": request_str})
+        response_obj = await llm_model.send_message_async(prompt, generation_config=GenerationConfig(**optional_params))
+        completion_response = response_obj.text
+        response_obj = response_obj._raw_response
+    elif mode == "chat":
+        # chat-bison etc.
+        pass
+    elif mode == "text":
+        # gecko etc.
+        pass
+
+
+    ## RESPONSE OBJECT
+    if len(str(completion_response)) > 0:
+        model_response["choices"][0]["message"][
+            "content"
+        ] = str(completion_response)
+    model_response["created"] = int(time.time())
+    model_response["model"] = model
+    ## CALCULATING USAGE
+    if model in litellm.vertex_language_models and response_obj is not None:
+        model_response["choices"][0].finish_reason = response_obj.candidates[0].finish_reason.name
+        usage = Usage(prompt_tokens=response_obj.usage_metadata.prompt_token_count,
+                      completion_tokens=response_obj.usage_metadata.candidates_token_count,
+                      total_tokens=response_obj.usage_metadata.total_token_count)
+        model_response.usage = usage
+    return model_response
+
+def async_streaming():
+    """
+    Add support for async streaming calls for gemini-pro
+    """
 
 def embedding():
     # logic for parsing in - calling - parsing out model embedding calls
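Note: for context on the call path the new `async_completion` function takes for gemini-pro (`mode == ""`), here is a minimal standalone sketch of the same vertexai SDK usage. It assumes `vertexai.init(...)` has already run with valid GCP credentials and that the project has access to `gemini-pro`; the function name `ask_gemini` is illustrative, not part of litellm.

    import asyncio
    from vertexai.preview.generative_models import GenerationConfig, GenerativeModel

    async def ask_gemini(prompt: str) -> str:
        # Mirrors the mode == "" branch above: start a chat session, then
        # await the SDK's native async send. Optional params are passed
        # through a GenerationConfig, matching the change in this diff.
        chat = GenerativeModel("gemini-pro").start_chat()
        response = await chat.send_message_async(
            prompt, generation_config=GenerationConfig(temperature=0.7)
        )
        return response.text

    print(asyncio.run(ask_gemini("Say hello in one sentence.")))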
diff --git a/litellm/main.py b/litellm/main.py
index 5138607bd..d8850c7e6 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -177,7 +177,8 @@ async def acompletion(*args, **kwargs):
             or custom_llm_provider == "perplexity"
             or custom_llm_provider == "text-completion-openai"
             or custom_llm_provider == "huggingface"
-            or custom_llm_provider == "ollama"): # currently implemented aiohttp calls for just azure and openai, soon all.
+            or custom_llm_provider == "ollama"
+            or custom_llm_provider == "vertex_ai"): # currently implemented aiohttp calls for just azure and openai, soon all.
         if kwargs.get("stream", False):
             response = completion(*args, **kwargs)
         else:
@@ -1152,7 +1153,8 @@ def completion(
                 encoding=encoding,
                 vertex_location=vertex_ai_location,
                 vertex_project=vertex_ai_project,
-                logging_obj=logging
+                logging_obj=logging,
+                acompletion=acompletion
             )
 
         if "stream" in optional_params and optional_params["stream"] == True:
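Note: the `main.py` change works because `acompletion` treats providers on that list as natively async: `completion()` may hand back a coroutine (as the patched vertex_ai code now does when `acompletion=True`), which is awaited directly instead of the whole blocking call being pushed to a thread executor. A minimal sketch of that dispatch pattern, with illustrative names rather than litellm's exact internals:

    import asyncio
    import inspect

    def provider_completion(native_async: bool):
        # Stands in for completion(): returns a coroutine on the async
        # path, a plain result otherwise.
        async def _acall():
            return {"text": "awaited natively"}
        return _acall() if native_async else {"text": "ran in executor"}

    async def dispatch(native_async: bool):
        if native_async:
            init_response = provider_completion(True)
            # Await the coroutine the provider handed back.
            if inspect.iscoroutine(init_response):
                return await init_response
            return init_response
        # Providers without native async still run in a worker thread.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, provider_completion, False)

    print(asyncio.run(dispatch(True)))
    print(asyncio.run(dispatch(False)))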
diff --git a/litellm/tests/test_amazing_vertex_completion.py b/litellm/tests/test_amazing_vertex_completion.py
index 02531abe8..620962128 100644
--- a/litellm/tests/test_amazing_vertex_completion.py
+++ b/litellm/tests/test_amazing_vertex_completion.py
@@ -9,15 +9,15 @@ import os, io
 sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path
-import pytest
+import pytest, asyncio
 import litellm
-from litellm import embedding, completion, completion_cost, Timeout
+from litellm import embedding, completion, completion_cost, Timeout, acompletion
 from litellm import RateLimitError
 import json
 import os
 import tempfile
 
-# litellm.num_retries = 3
+litellm.num_retries = 3
 litellm.cache = None
 user_message = "Write a short poem about the sky"
 messages = [{"content": user_message, "role": "user"}]
@@ -73,14 +73,14 @@ def test_vertex_ai():
     litellm.vertex_project = "hardy-device-386718"
 
     test_models = random.sample(test_models, 4)
-    test_models += litellm.vertex_language_models # always test gemini-pro
+    test_models = litellm.vertex_language_models # always test gemini-pro
     for model in test_models:
         try:
             if model in ["code-gecko@001", "code-gecko@latest", "code-bison@001", "text-bison@001"]:
                 # our account does not have access to this model
                 continue
             print("making request", model)
-            response = completion(model=model, messages=[{'role': 'user', 'content': 'hi'}])
+            response = completion(model=model, messages=[{'role': 'user', 'content': 'hi'}], temperature=0.7)
             print("\nModel Response", response)
             print(response)
             assert type(response.choices[0].message.content) == str
@@ -117,3 +117,19 @@ def test_vertex_ai_stream():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 # test_vertex_ai_stream()
+
+@pytest.mark.asyncio
+async def test_async_vertexai_response():
+    load_vertex_ai_credentials()
+    user_message = "Hello, how are you?"
+    messages = [{"content": user_message, "role": "user"}]
+    try:
+        response = await acompletion(model="gemini-pro", messages=messages, temperature=0.7, timeout=5)
+        # response = await response
+        print(f"response: {response}")
+    except litellm.Timeout as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"An exception occurred: {e}")
+
+asyncio.run(test_async_vertexai_response())
\ No newline at end of file
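Note: end to end, the usage this test exercises looks like the sketch below; the model name, credentials, and 5-second timeout are assumptions carried over from the test itself. Also worth flagging: the module-level `asyncio.run(test_async_vertexai_response())` runs the test at import time, which is handy for ad-hoc runs but redundant under pytest, where `@pytest.mark.asyncio` (the pytest-asyncio plugin) already drives the event loop.

    import asyncio
    import litellm
    from litellm import acompletion

    async def main():
        try:
            response = await acompletion(
                model="gemini-pro",
                messages=[{"content": "Hello, how are you?", "role": "user"}],
                temperature=0.7,
                timeout=5,
            )
            print(response.choices[0].message.content)
        except litellm.Timeout:
            pass  # like the test, treat a slow response as tolerable, not a failure

    asyncio.run(main())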