diff --git a/litellm/__init__.py b/litellm/__init__.py index 57ba28b998..be21654bda 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -28,6 +28,7 @@ vertex_project: Optional[str] = None vertex_location: Optional[str] = None togetherai_api_key: Optional[str] = None baseten_key: Optional[str] = None +aleph_alpha_key: Optional[str] = None use_client = False logging = True caching = False # deprecated son @@ -225,6 +226,15 @@ together_ai_models = [ "togethercomputer/llama-2-7b", ] +aleph_alpha_models = [ + "luminous-base", + "luminous-base-control", + "luminous-extended", + "luminous-extended-control", + "luminous-supreme", + "luminous-supreme-control" +] + baseten_models = ["qvv0xeq", "q841o8w", "31dxrj3"] # FALCON 7B # WizardLM # Mosaic ML model_list = ( @@ -240,6 +250,7 @@ model_list = ( + ai21_models + together_ai_models + baseten_models + + aleph_alpha_models ) provider_list = [ diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc index 8740f2ca7b..c2b76dce9e 100644 Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc index 5702cb34f2..ac5caf6db9 100644 Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc index 6e56ae6b94..a04749ebcd 100644 Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ diff --git a/litellm/llms/aleph_alpha.py b/litellm/llms/aleph_alpha.py new file mode 100644 index 0000000000..c513cb105a --- /dev/null +++ b/litellm/llms/aleph_alpha.py @@ -0,0 +1,138 @@ +import os, json +from enum import Enum +import requests +import time +from typing import Callable +from litellm.utils import ModelResponse + +class AlephAlphaError(Exception): + def __init__(self, status_code, message): + self.status_code = status_code + self.message = message + super().__init__( + self.message + ) # Call the base class constructor with the parameters it needs + + +class AlephAlphaLLM: + def __init__( + self, encoding, default_max_tokens_to_sample, logging_obj, api_key=None + ): + self.encoding = encoding + self.default_max_tokens_to_sample = default_max_tokens_to_sample + self.completion_url = "https://api.aleph-alpha.com/complete" + self.api_key = api_key + self.logging_obj = logging_obj + self.validate_environment(api_key=api_key) + + def validate_environment( + self, api_key + ): # set up the environment required to run the model + # set the api key + if self.api_key == None: + raise ValueError( + "Missing Aleph Alpha API Key - A call is being made to Aleph Alpha but no key is set either in the environment variables or via params" + ) + self.api_key = api_key + self.headers = { + "accept": "application/json", + "content-type": "application/json", + "Authorization": "Bearer " + self.api_key, + } + + def completion( + self, + model: str, + messages: list, + model_response: ModelResponse, + print_verbose: Callable, + optional_params=None, + litellm_params=None, + logger_fn=None, + ): # logic for parsing in - calling - parsing out model completion calls + model = model + prompt = "" + if "control" in model: # follow the ###Instruction / ###Response format + for idx, message in enumerate(messages): + if "role" in message: + if idx == 0: # set first message as 
instruction (required), let later user messages be input
+                        prompt += f"###Instruction: {message['content']}"
+                    else:
+                        if message["role"] == "system":
+                            prompt += (
+                                f"###Instruction: {message['content']}"
+                            )
+                        elif message["role"] == "user":
+                            prompt += (
+                                f"###Input: {message['content']}"
+                            )
+                        else:
+                            prompt += (
+                                f"###Response: {message['content']}"
+                            )
+                else:
+                    prompt += f"{message['content']}"
+        else:
+            prompt = " ".join(message["content"] for message in messages)
+        data = {
+            "model": model,
+            "prompt": prompt,
+            "maximum_tokens": optional_params["maximum_tokens"] if "maximum_tokens" in optional_params else self.default_max_tokens_to_sample, # required input
+            **optional_params,
+        }
+
+        ## LOGGING
+        self.logging_obj.pre_call(
+            input=prompt,
+            api_key=self.api_key,
+            additional_args={"complete_input_dict": data},
+        )
+        ## COMPLETION CALL
+        response = requests.post(
+            self.completion_url, headers=self.headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
+        )
+        if "stream" in optional_params and optional_params["stream"] == True:
+            return response.iter_lines()
+        else:
+            ## LOGGING
+            self.logging_obj.post_call(
+                input=prompt,
+                api_key=self.api_key,
+                original_response=response.text,
+                additional_args={"complete_input_dict": data},
+            )
+            print_verbose(f"raw model_response: {response.text}")
+            ## RESPONSE OBJECT
+            completion_response = response.json()
+            if "error" in completion_response:
+                raise AlephAlphaError(
+                    message=completion_response["error"],
+                    status_code=response.status_code,
+                )
+            else:
+                try:
+                    model_response["choices"][0]["message"]["content"] = completion_response["completions"][0]["completion"]
+                except:
+                    raise AlephAlphaError(message=json.dumps(completion_response), status_code=response.status_code)
+
+            ## CALCULATING USAGE - token counts are computed locally via self.encoding
+ prompt_tokens = len( + self.encoding.encode(prompt) + ) + completion_tokens = len( + self.encoding.encode(model_response["choices"][0]["message"]["content"]) + ) + + model_response["created"] = time.time() + model_response["model"] = model + model_response["usage"] = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": prompt_tokens + completion_tokens, + } + return model_response + + def embedding( + self, + ): # logic for parsing in - calling - parsing out model embedding calls + pass diff --git a/litellm/main.py b/litellm/main.py index c6c03729b0..1731ed548a 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -24,6 +24,7 @@ from .llms.huggingface_restapi import HuggingfaceRestAPILLM from .llms.baseten import BasetenLLM from .llms.ai21 import AI21LLM from .llms.together_ai import TogetherAILLM +from .llms.aleph_alpha import AlephAlphaLLM import tiktoken from concurrent.futures import ThreadPoolExecutor @@ -428,6 +429,33 @@ def completion( litellm_params=litellm_params, logger_fn=logger_fn, ) + if "stream" in optional_params and optional_params["stream"] == True: + # don't try to access stream object, + response = CustomStreamWrapper(model_response, model, logging_obj=logging) + return response + response = model_response + elif model in litellm.aleph_alpha_models: + aleph_alpha_key = ( + api_key or litellm.aleph_alpha_key or os.environ.get("ALEPH_ALPHA_API_KEY") + ) + + aleph_alpha_client = AlephAlphaLLM( + encoding=encoding, + default_max_tokens_to_sample=litellm.max_tokens, + api_key=aleph_alpha_key, + logging_obj=logging # model call logging done inside the class as we make need to modify I/O to fit aleph alpha's requirements + ) + + model_response = aleph_alpha_client.completion( + model=model, + messages=messages, + model_response=model_response, + print_verbose=print_verbose, + optional_params=optional_params, + litellm_params=litellm_params, + logger_fn=logger_fn, + ) + if "stream" in optional_params and optional_params["stream"] == True: # don't try to access stream object, response = CustomStreamWrapper(model_response, model, logging_obj=logging) diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py index 7fd5b34f62..3b8bdc8834 100644 --- a/litellm/tests/test_completion.py +++ b/litellm/tests/test_completion.py @@ -54,6 +54,27 @@ def test_completion_claude(): except Exception as e: pytest.fail(f"Error occurred: {e}") +# def test_completion_aleph_alpha(): +# try: +# response = completion( +# model="luminous-base", messages=messages, logger_fn=logger_fn +# ) +# # Add any assertions here to check the response +# print(response) +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + + +# def test_completion_aleph_alpha_control_models(): +# try: +# response = completion( +# model="luminous-base-control", messages=messages, logger_fn=logger_fn +# ) +# # Add any assertions here to check the response +# print(response) +# except Exception as e: +# pytest.fail(f"Error occurred: {e}") + def test_completion_with_litellm_call_id(): try: litellm.use_client = False diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py index 047fc45537..90828d20cb 100644 --- a/litellm/tests/test_streaming.py +++ b/litellm/tests/test_streaming.py @@ -45,59 +45,62 @@ messages = [{"content": user_message, "role": "user"}] # pass # test on openai completion call -try: - response = completion( - model="text-davinci-003", messages=messages, stream=True, logger_fn=logger_fn - ) - complete_response = "" - 
start_time = time.time() - for chunk in response: - chunk_time = time.time() - print(f"chunk: {chunk}") - complete_response += chunk["choices"][0]["delta"]["content"] - if complete_response == "": - raise Exception("Empty response received") -except: - print(f"error occurred: {traceback.format_exc()}") - pass +def test_openai_text_completion_call(): + try: + response = completion( + model="text-davinci-003", messages=messages, stream=True, logger_fn=logger_fn + ) + complete_response = "" + start_time = time.time() + for chunk in response: + chunk_time = time.time() + print(f"chunk: {chunk}") + complete_response += chunk["choices"][0]["delta"]["content"] + if complete_response == "": + raise Exception("Empty response received") + except: + print(f"error occurred: {traceback.format_exc()}") + pass # # test on ai21 completion call -try: - response = completion( - model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn - ) - print(f"response: {response}") - complete_response = "" - start_time = time.time() - for chunk in response: - chunk_time = time.time() - print(f"time since initial request: {chunk_time - start_time:.5f}") - print(chunk["choices"][0]["delta"]) - complete_response += chunk["choices"][0]["delta"]["content"] - if complete_response == "": - raise Exception("Empty response received") -except: - print(f"error occurred: {traceback.format_exc()}") - pass +def ai21_completion_call(): + try: + response = completion( + model="j2-ultra", messages=messages, stream=True, logger_fn=logger_fn + ) + print(f"response: {response}") + complete_response = "" + start_time = time.time() + for chunk in response: + chunk_time = time.time() + print(f"time since initial request: {chunk_time - start_time:.5f}") + print(chunk["choices"][0]["delta"]) + complete_response += chunk["choices"][0]["delta"]["content"] + if complete_response == "": + raise Exception("Empty response received") + except: + print(f"error occurred: {traceback.format_exc()}") + pass # test on openai completion call -try: - response = completion( - model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn - ) - complete_response = "" - start_time = time.time() - for chunk in response: - chunk_time = time.time() - print(f"time since initial request: {chunk_time - start_time:.5f}") - print(chunk["choices"][0]["delta"]) - complete_response += chunk["choices"][0]["delta"]["content"] - if complete_response == "": - raise Exception("Empty response received") -except: - print(f"error occurred: {traceback.format_exc()}") - pass +def test_openai_chat_completion_call(): + try: + response = completion( + model="gpt-3.5-turbo", messages=messages, stream=True, logger_fn=logger_fn + ) + complete_response = "" + start_time = time.time() + for chunk in response: + chunk_time = time.time() + print(f"time since initial request: {chunk_time - start_time:.5f}") + print(chunk["choices"][0]["delta"]) + complete_response += chunk["choices"][0]["delta"]["content"] + if complete_response == "": + raise Exception("Empty response received") + except: + print(f"error occurred: {traceback.format_exc()}") + pass # # test on azure completion call @@ -138,50 +141,79 @@ except: # pass # test on together ai completion call - replit-code-3b -try: - start_time = time.time() - response = completion( - model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream=True - ) - complete_response = "" - print(f"returned response object: {response}") - for chunk in response: - chunk_time = time.time() - print(f"time since initial 
request: {chunk_time - start_time:.2f}") - print(chunk["choices"][0]["delta"]) - complete_response += ( - chunk["choices"][0]["delta"]["content"] - if len(chunk["choices"][0]["delta"].keys()) > 0 - else "" +def test_together_ai_completion_call_replit(): + try: + start_time = time.time() + response = completion( + model="Replit-Code-3B", messages=messages, logger_fn=logger_fn, stream=True ) - if complete_response == "": - raise Exception("Empty response received") -except: - print(f"error occurred: {traceback.format_exc()}") - pass + complete_response = "" + print(f"returned response object: {response}") + for chunk in response: + chunk_time = time.time() + print(f"time since initial request: {chunk_time - start_time:.2f}") + print(chunk["choices"][0]["delta"]) + complete_response += ( + chunk["choices"][0]["delta"]["content"] + if len(chunk["choices"][0]["delta"].keys()) > 0 + else "" + ) + if complete_response == "": + raise Exception("Empty response received") + except: + print(f"error occurred: {traceback.format_exc()}") + pass # # test on together ai completion call - starcoder -try: - start_time = time.time() - response = completion( - model="together_ai/bigcode/starcoder", - messages=messages, - logger_fn=logger_fn, - stream=True, - ) - complete_response = "" - print(f"returned response object: {response}") - for chunk in response: - chunk_time = time.time() - complete_response += ( - chunk["choices"][0]["delta"]["content"] - if len(chunk["choices"][0]["delta"].keys()) > 0 - else "" +def test_together_ai_completion_call_starcoder(): + try: + start_time = time.time() + response = completion( + model="together_ai/bigcode/starcoder", + messages=messages, + logger_fn=logger_fn, + stream=True, ) - if len(complete_response) > 0: - print(complete_response) - if complete_response == "": - raise Exception("Empty response received") -except: - print(f"error occurred: {traceback.format_exc()}") - pass + complete_response = "" + print(f"returned response object: {response}") + for chunk in response: + chunk_time = time.time() + complete_response += ( + chunk["choices"][0]["delta"]["content"] + if len(chunk["choices"][0]["delta"].keys()) > 0 + else "" + ) + if len(complete_response) > 0: + print(complete_response) + if complete_response == "": + raise Exception("Empty response received") + except: + print(f"error occurred: {traceback.format_exc()}") + pass + +# test on aleph alpha completion call +# def test_aleph_alpha_call(): +# try: +# start_time = time.time() +# response = completion( +# model="luminous-base", +# messages=messages, +# logger_fn=logger_fn, +# stream=True, +# ) +# complete_response = "" +# print(f"returned response object: {response}") +# for chunk in response: +# chunk_time = time.time() +# complete_response += ( +# chunk["choices"][0]["delta"]["content"] +# if len(chunk["choices"][0]["delta"].keys()) > 0 +# else "" +# ) +# if len(complete_response) > 0: +# print(complete_response) +# if complete_response == "": +# raise Exception("Empty response received") +# except: +# print(f"error occurred: {traceback.format_exc()}") +# pass diff --git a/litellm/utils.py b/litellm/utils.py index d10a742fdf..7c3eab50ca 100644 --- a/litellm/utils.py +++ b/litellm/utils.py @@ -780,6 +780,25 @@ def get_optional_params( # use the openai defaults if presence_penalty != 0: optional_params["repetition_penalty"] = presence_penalty optional_params["details"] = True + elif model in litellm.aleph_alpha_models: + if max_tokens != float("inf"): + optional_params["maximum_tokens"] = max_tokens + if 
stream:
+            optional_params["stream"] = stream
+        if temperature != 1:
+            optional_params["temperature"] = temperature
+        if top_k != 40:
+            optional_params["top_k"] = top_k
+        if top_p != 1:
+            optional_params["top_p"] = top_p
+        if presence_penalty != 0:
+            optional_params["presence_penalty"] = presence_penalty
+        if frequency_penalty != 0:
+            optional_params["frequency_penalty"] = frequency_penalty
+        if n != 1:
+            optional_params["n"] = n
+        if stop != None:
+            optional_params["stop_sequences"] = stop
     else:  # assume passing in params for openai/azure openai
         if functions != []:
             optional_params["functions"] = functions
@@ -1766,6 +1785,14 @@ class CustomStreamWrapper:
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
 
+    def handle_aleph_alpha_chunk(self, chunk):
+        chunk = chunk.decode("utf-8")
+        data_json = json.loads(chunk)
+        try:
+            return data_json["completions"][0]["completion"]
+        except:
+            raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
     def handle_openai_text_completion_chunk(self, chunk):
         try:
             return chunk["choices"][0]["text"]
@@ -1832,6 +1859,9 @@ class CustomStreamWrapper:
         elif self.custom_llm_provider and self.custom_llm_provider == "ai21": #ai21 doesn't provide streaming
             chunk = next(self.completion_stream)
             completion_obj["content"] = self.handle_ai21_chunk(chunk)
+        elif self.model in litellm.aleph_alpha_models: # aleph alpha streams JSON lines; parse each one
+            chunk = next(self.completion_stream)
+            completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk)
         elif self.model in litellm.open_ai_text_completion_models:
             chunk = next(self.completion_stream)
             completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk)
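
Usage sketch (not part of the patch above): a minimal, hypothetical example of how the Aleph Alpha path added in this diff might be exercised once the change is installed. It assumes a valid key is exported as ALEPH_ALPHA_API_KEY (or set via litellm.aleph_alpha_key) and uses model names from the aleph_alpha_models list registered in litellm/__init__.py; the streaming loop mirrors the commented-out test_aleph_alpha_call test.

import os
import litellm
from litellm import completion

# Key resolution in main.py: api_key arg -> litellm.aleph_alpha_key -> ALEPH_ALPHA_API_KEY env var.
litellm.aleph_alpha_key = os.environ.get("ALEPH_ALPHA_API_KEY")

messages = [
    {"role": "system", "content": "You are a concise assistant."},  # idx 0 becomes ###Instruction for *-control models
    {"role": "user", "content": "Say hello in one sentence."},      # later user messages become ###Input
]

# Non-streaming: routed to AlephAlphaLLM.completion(), which POSTs to
# https://api.aleph-alpha.com/complete; max_tokens is mapped to maximum_tokens.
response = completion(
    model="luminous-base-control",
    messages=messages,
    max_tokens=100,
    temperature=0.2,
)
print(response["choices"][0]["message"]["content"])

# Streaming: raw lines from the API are parsed by CustomStreamWrapper.handle_aleph_alpha_chunk().
for chunk in completion(model="luminous-base", messages=messages, stream=True):
    delta = chunk["choices"][0]["delta"]
    print(delta["content"] if len(delta.keys()) > 0 else "", end="")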