diff --git a/litellm/__init__.py b/litellm/__init__.py
index ec3ea31651..882994184e 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -29,6 +29,7 @@ vertex_location: Optional[str] = None
 togetherai_api_key: Optional[str] = None
 baseten_key: Optional[str] = None
 aleph_alpha_key: Optional[str] = None
+nlp_cloud_key: Optional[str] = None
 use_client = False
 logging = True
 caching = False # deprecated son
@@ -152,6 +153,8 @@ huggingface_models = [
 ai21_models = ["j2-ultra", "j2-mid", "j2-light"]
 
+nlp_cloud_models = ["dolphin", "chatdolphin"]
+
 together_ai_models = [
     "togethercomputer/llama-2-70b-chat",
     "togethercomputer/Llama-2-7B-32K-Instruct",
@@ -183,6 +186,7 @@ model_list = (
     + together_ai_models
     + baseten_models
     + aleph_alpha_models
+    + nlp_cloud_models
 )
 
 provider_list = [
@@ -200,6 +204,7 @@ provider_list = [
     "sagemaker",
     "bedrock",
     "vllm",
+    "nlp_cloud",
     "custom", # custom apis
 ]
diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index b5b19892ba..f687fe5eca 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 7d1b0da0d7..57f2e449a6 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index 1993162724..0062150af5 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/nlp_cloud.py b/litellm/llms/nlp_cloud.py
new file mode 100644
index 0000000000..8dd2e02309
--- /dev/null
+++ b/litellm/llms/nlp_cloud.py
@@ -0,0 +1,103 @@
+import os
+import json
+from enum import Enum
+import requests
+import time
+from typing import Callable
+from litellm.utils import ModelResponse
+
+class NLPCloudError(Exception):
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+def validate_environment(api_key):
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+    }
+    if api_key:
+        headers["Authorization"] = f"Token {api_key}"
+    return headers
+
+def completion(
+    model: str,
+    messages: list,
+    model_response: ModelResponse,
+    print_verbose: Callable,
+    encoding,
+    api_key,
+    logging_obj,
+    optional_params=None,
+    litellm_params=None,
+    logger_fn=None,
+    default_max_tokens_to_sample=None,
+):
+    headers = validate_environment(api_key)
+    completion_url_fragment_1 = "https://api.nlpcloud.io/v1/gpu/"
+    completion_url_fragment_2 = "/generation"
+    model = model
+    text = " ".join(message["content"] for message in messages)
+
+    data = {
+        "text": text,
+        **optional_params,
+    }
+
+    completion_url = completion_url_fragment_1 + model + completion_url_fragment_2
+    ## LOGGING
+    logging_obj.pre_call(
+        input=text,
+        api_key=api_key,
+        additional_args={"complete_input_dict": data},
+    )
+    ## COMPLETION CALL
+    response = requests.post(
+        completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
+    )
+    if "stream" in optional_params and optional_params["stream"] == True:
+        return response.iter_lines()
+    else:
+        ## LOGGING
+        logging_obj.post_call(
+            input=text,
+            api_key=api_key,
+            original_response=response.text,
+            additional_args={"complete_input_dict": data},
+        )
+        print_verbose(f"raw model_response: {response.text}")
+        ## RESPONSE OBJECT
+        try:
+            completion_response = response.json()
+        except:
+            raise NLPCloudError(message=response.text, status_code=response.status_code)
+        if "error" in completion_response:
+            raise NLPCloudError(
+                message=completion_response["error"],
+                status_code=response.status_code,
+            )
+        else:
+            try:
+                model_response["choices"][0]["message"]["content"] = completion_response["generated_text"]
+            except:
+                raise NLPCloudError(message=json.dumps(completion_response), status_code=response.status_code)
+
+        ## CALCULATING USAGE - NLP Cloud returns token counts in its response (nb_input_tokens / nb_generated_tokens), so use them directly
+        prompt_tokens = completion_response["nb_input_tokens"]
+        completion_tokens = completion_response["nb_generated_tokens"]
+
+        model_response["created"] = time.time()
+        model_response["model"] = model
+        model_response["usage"] = {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        }
+        return model_response
+
+def embedding():
+    # logic for parsing in - calling - parsing out model embedding calls
+    pass
diff --git a/litellm/main.py b/litellm/main.py
index 243dd891e2..476f83da9b 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -27,6 +27,7 @@ from .llms import bedrock
 from .llms import huggingface_restapi
 from .llms import replicate
 from .llms import aleph_alpha
+from .llms import nlp_cloud
 from .llms import baseten
 from .llms import vllm
 from .llms import ollama
@@ -111,6 +112,7 @@ def completion(
     # model specific optional params
     top_k=40,# used by text-bison only
     task: Optional[str]="text-generation-inference", # used by huggingface inference endpoints
+    remove_input: bool = True, # used by nlp cloud models - prevents input text from being returned as part of output
     request_timeout=0, # unused var for old version of OpenAI API
     fallbacks=[],
     caching = False,
@@ -154,7 +156,8 @@ def completion(
             model=model,
             custom_llm_provider=custom_llm_provider,
             top_k=top_k,
-            task=task
+            task=task,
+            remove_input=remove_input
         )
         # For logging - save the values of the litellm-specific params passed in
         litellm_params = get_litellm_params(
@@ -421,6 +424,29 @@ def completion(
                 response = CustomStreamWrapper(model_response, model, logging_obj=logging)
                 return response
             response = model_response
+        elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+            nlp_cloud_key = (
+                api_key or litellm.nlp_cloud_key or get_secret("NLP_CLOUD_API_KEY") or litellm.api_key
+            )
+
+            model_response = nlp_cloud.completion(
+                model=model,
+                messages=messages,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,
+                logger_fn=logger_fn,
+                encoding=encoding,
+                api_key=nlp_cloud_key,
+                logging_obj=logging
+            )
+
+            if "stream" in optional_params and optional_params["stream"] == True:
+                # don't try to access stream object,
+                response = CustomStreamWrapper(model_response, model, custom_llm_provider="nlp_cloud", logging_obj=logging)
+                return response
+            response = model_response
         elif model in litellm.aleph_alpha_models:
             aleph_alpha_key = (
                 api_key or litellm.aleph_alpha_key or get_secret("ALEPH_ALPHA_API_KEY") or get_secret("ALEPHALPHA_API_KEY") or litellm.api_key
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 84336eb88f..98faeb5c2c 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -113,8 +113,38 @@ def test_completion_claude_stream():
         pytest.fail(f"Error occurred: {e}")
 
 # test_completion_claude_stream()
 
+def test_completion_nlp_cloud():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(model="dolphin", messages=messages, logger_fn=logger_fn)
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+def test_completion_nlp_cloud_streaming():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(model="dolphin", messages=messages, stream=True, logger_fn=logger_fn)
+        # Add any assertions here to check the response
+        for chunk in response:
+            print(chunk["choices"][0]["delta"]["content"]) # same as openai format
+            print(chunk["choices"][0]["finish_reason"])
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+test_completion_nlp_cloud_streaming()
 
 # def test_completion_hf_api():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py
index f08705e41c..6320b7131d 100644
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@@ -75,6 +75,9 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
     elif model in litellm.aleph_alpha_models:
         temporary_key = os.environ["ALEPH_ALPHA_API_KEY"]
         os.environ["ALEPH_ALPHA_API_KEY"] = "bad-key"
+    elif model in litellm.nlp_cloud_models:
+        temporary_key = os.environ["NLP_CLOUD_API_KEY"]
+        os.environ["NLP_CLOUD_API_KEY"] = "bad-key"
     elif (
         model
         == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
@@ -119,6 +122,8 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
         os.environ["TOGETHERAI_API_KEY"] = temporary_key
     elif model in litellm.aleph_alpha_models:
         os.environ["ALEPH_ALPHA_API_KEY"] = temporary_key
+    elif model in litellm.nlp_cloud_models:
+        os.environ["NLP_CLOUD_API_KEY"] = temporary_key
     return
 
 # Test 3: Invalid Request Error
diff --git a/litellm/utils.py b/litellm/utils.py
index c5f35cfde4..ff7a8588c5 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -783,6 +783,7 @@ def get_optional_params( # use the openai defaults
     frequency_penalty=0,
     logit_bias={},
     num_beams=1,
+    remove_input=False, # for nlp_cloud
     user="",
     deployment_id=None,
     model=None,
@@ -917,6 +918,29 @@ def get_optional_params( # use the openai defaults
             optional_params["n"] = n
         if stop != None:
             optional_params["stop_sequences"] = stop
+    elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+        if max_tokens != float("inf"):
+            optional_params["max_length"] = max_tokens
+        if stream:
+            optional_params["stream"] = stream
+        if temperature != 1:
+            optional_params["temperature"] = temperature
+        if top_k != 40:
+            optional_params["top_k"] = top_k
+        if top_p != 1:
+            optional_params["top_p"] = top_p
+        if presence_penalty != 0:
+            optional_params["presence_penalty"] = presence_penalty
+        if frequency_penalty != 0:
+            optional_params["frequency_penalty"] = frequency_penalty
+        if num_beams != 1:
+            optional_params["num_beams"] = num_beams
+        if n != 1:
+            optional_params["num_return_sequences"] = n
+        if remove_input == True:
+            optional_params["remove_input"] = True
+        if stop != None:
+            optional_params["stop_sequences"] = stop
     else: # assume passing in params for openai/azure openai
         if functions != []:
             optional_params["functions"] = functions
@@ -993,6 +1017,9 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
     ## baseten
     elif model in litellm.baseten_models:
         custom_llm_provider = "baseten"
+    ## nlp_cloud
+    elif model in litellm.nlp_cloud_models:
+        custom_llm_provider = "nlp_cloud"
 
     if custom_llm_provider is None or custom_llm_provider=="":
         raise ValueError(f"LLM Provider NOT provided. Pass in the LLM provider you are trying to call. E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/{model}',..)` Learn more: https://docs.litellm.ai/docs/providers")
@@ -1968,6 +1995,81 @@ def exception_type(model, original_exception, custom_llm_provider):
                     llm_provider="ai21",
                     model=model
                 )
+            elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+                if "detail" in error_str:
+                    if "Input text length should not exceed" in error_str:
+                        exception_mapping_worked = True
+                        raise ContextWindowExceededError(
+                            message=f"NLPCloudException - {error_str}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    elif "value is not a valid" in error_str:
+                        exception_mapping_worked = True
+                        raise InvalidRequestError(
+                            message=f"NLPCloudException - {error_str}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    else:
+                        exception_mapping_worked = True
+                        raise APIError(
+                            status_code=500,
+                            message=f"NLPCloudException - {error_str}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                if hasattr(original_exception, "status_code"): # https://docs.nlpcloud.com/?shell#errors
+                    if original_exception.status_code == 400 or original_exception.status_code == 406 or original_exception.status_code == 413 or original_exception.status_code == 422:
+                        exception_mapping_worked = True
+                        raise InvalidRequestError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
+                    elif original_exception.status_code == 401 or original_exception.status_code == 403:
+                        exception_mapping_worked = True
+                        raise AuthenticationError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
+                    elif original_exception.status_code == 522 or original_exception.status_code == 524:
+                        exception_mapping_worked = True
+                        raise Timeout(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    elif original_exception.status_code == 429 or original_exception.status_code == 402:
+                        exception_mapping_worked = True
+                        raise RateLimitError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                        )
+                    elif original_exception.status_code == 500 or original_exception.status_code == 503:
+                        exception_mapping_worked = True
+                        raise APIError(
+                            status_code=original_exception.status_code,
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
+                    elif original_exception.status_code == 504 or original_exception.status_code == 520:
+                        exception_mapping_worked = True
+                        raise ServiceUnavailableError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    else:
+                        exception_mapping_worked = True
+                        raise APIError(
+                            status_code=original_exception.status_code,
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
"together_ai": error_response = json.loads(error_str) if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]: @@ -2240,6 +2342,15 @@ class CustomStreamWrapper: except: raise ValueError(f"Unable to parse response. Original response: {chunk}") + def handle_nlp_cloud_chunk(self, chunk): + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + print(f"data json: {data_json}") + return data_json["generated_text"] + except: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + def handle_aleph_alpha_chunk(self, chunk): chunk = chunk.decode("utf-8") data_json = json.loads(chunk) @@ -2320,12 +2431,15 @@ class CustomStreamWrapper: elif self.custom_llm_provider and self.custom_llm_provider == "vllm": chunk = next(self.completion_stream) completion_obj["content"] = chunk[0].outputs[0].text - elif self.model in litellm.aleph_alpha_models: #ai21 doesn't provide streaming + elif self.model in litellm.aleph_alpha_models: #aleph alpha doesn't provide streaming chunk = next(self.completion_stream) completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk) elif self.model in litellm.open_ai_text_completion_models: chunk = next(self.completion_stream) completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk) + elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud": + chunk = next(self.completion_stream) + completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk) else: # openai chat/azure models chunk = next(self.completion_stream) return chunk # open ai returns finish_reason, we should just return the openai chunk diff --git a/pyproject.toml b/pyproject.toml index e543b97bd6..17e65d90b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "0.1.623" +version = "0.1.624" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT License"