diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 6b42eec17..3b91b0fb3
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index fbebc9cd5..76b82a67d
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/cohere.py b/litellm/llms/cohere.py
new file mode 100644
index 000000000..113b6b542
--- /dev/null
+++ b/litellm/llms/cohere.py
@@ -0,0 +1,101 @@
+import os
+import json
+from enum import Enum
+import requests
+import time
+from typing import Callable
+from litellm.utils import ModelResponse
+
+class CohereError(Exception):
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+def validate_environment(api_key):
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+    }
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    return headers
+
+def completion(
+    model: str,
+    messages: list,
+    model_response: ModelResponse,
+    print_verbose: Callable,
+    encoding,
+    api_key,
+    logging_obj,
+    optional_params=None,
+    litellm_params=None,
+    logger_fn=None,
+):
+    headers = validate_environment(api_key)
+    completion_url = "https://api.cohere.ai/v1/generate"
+    model = model
+    prompt = " ".join(message["content"] for message in messages)
+    data = {
+        "model": model,
+        "prompt": prompt,
+        **optional_params,
+    }
+
+    ## LOGGING
+    logging_obj.pre_call(
+        input=prompt,
+        api_key=api_key,
+        additional_args={"complete_input_dict": data},
+    )
+    ## COMPLETION CALL
+    response = requests.post(
+        completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
+    )
+    if "stream" in optional_params and optional_params["stream"] == True:
+        return response.iter_lines()
+    else:
+        ## LOGGING
+        logging_obj.post_call(
+            input=prompt,
+            api_key=api_key,
+            original_response=response.text,
+            additional_args={"complete_input_dict": data},
+        )
+        print_verbose(f"raw model_response: {response.text}")
+        ## RESPONSE OBJECT
+        completion_response = response.json()
+        if "error" in completion_response:
+            raise CohereError(
+                message=completion_response["error"],
+                status_code=response.status_code,
+            )
+        else:
+            try:
+                model_response["choices"][0]["message"]["content"] = completion_response["generations"][0]["text"]
+            except:
+                raise CohereError(message=json.dumps(completion_response), status_code=response.status_code)
+
+        ## CALCULATING USAGE - token counts computed locally with the provided encoding
+        prompt_tokens = len(
+            encoding.encode(prompt)
+        )
+        completion_tokens = len(
+            encoding.encode(model_response["choices"][0]["message"]["content"])
+        )
+
+        model_response["created"] = time.time()
+        model_response["model"] = model
+        model_response["usage"] = {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        }
+        return model_response
+
+def embedding():
+    # logic for parsing in - calling - parsing out model embedding calls
+    pass
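
Illustrative sketch (not part of the patch): the new handler above calls Cohere's REST endpoint directly instead of importing the `cohere` SDK. This is a minimal sketch of the request/response shape it assumes, reusing the URL, headers, and `generations[0]["text"]` field from `completion()` above; reading the key from an environment variable here is just for illustration, since the handler itself receives `api_key` as an argument.

    # Illustrative only -- not part of the patch.
    import json
    import os

    import requests

    api_key = os.environ["COHERE_API_KEY"]  # illustrative; completion() is handed the key directly
    resp = requests.post(
        "https://api.cohere.ai/v1/generate",
        headers={
            "accept": "application/json",
            "content-type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        data=json.dumps({"model": "command-nightly", "prompt": "Hello, how are you?", "max_tokens": 50}),
    )
    completion_response = resp.json()
    # Non-streaming responses are parsed the same way completion() does it above.
    print(completion_response["generations"][0]["text"])
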
diff --git a/litellm/main.py b/litellm/main.py
index 9edf423e0..26d54cf67 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -32,6 +32,7 @@ from .llms import nlp_cloud
 from .llms import baseten
 from .llms import vllm
 from .llms import ollama
+from .llms import cohere
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, List, Optional, Dict
@@ -547,12 +548,6 @@ def completion(
                 input=messages, api_key=openai.api_key, original_response=response
             )
         elif model in litellm.cohere_models:
-            # import cohere/if it fails then pip install cohere
-            try:
-                import cohere
-            except:
-                raise Exception("Cohere import failed please run `pip install cohere`")
-
             cohere_key = (
                 api_key
                 or litellm.cohere_key
@@ -560,35 +555,23 @@ def completion(
                 or get_secret("CO_API_KEY")
                 or litellm.api_key
             )
-            co = cohere.Client(cohere_key)
-            prompt = " ".join([message["content"] for message in messages])
-            ## LOGGING
-            logging.pre_call(input=prompt, api_key=cohere_key)
-            ## COMPLETION CALL
-            response = co.generate(model=model, prompt=prompt, **optional_params)
+            model_response = cohere.completion(
+                model=model,
+                messages=messages,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,
+                logger_fn=logger_fn,
+                encoding=encoding,
+                api_key=cohere_key,
+                logging_obj=logging  # model call logging is done inside cohere.completion, as we may need to modify I/O to fit Cohere's requirements
+            )
+
             if "stream" in optional_params and optional_params["stream"] == True:
                 # don't try to access the stream object,
-                response = CustomStreamWrapper(response, model, logging_obj=logging)
+                response = CustomStreamWrapper(model_response, model, logging_obj=logging)
                 return response
-            ## LOGGING
-            logging.post_call(
-                input=prompt, api_key=cohere_key, original_response=response
-            )
-            ## USAGE
-            completion_response = response[0].text
-            prompt_tokens = len(encoding.encode(prompt))
-            completion_tokens = len(encoding.encode(completion_response))
-            ## RESPONSE OBJECT
-            model_response["choices"][0]["message"]["content"] = completion_response
-            if response[0].finish_reason:
-                model_response.choices[0].finish_reason = response[0].finish_reason
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            }
             response = model_response
         elif (
             (
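
Illustrative sketch (not part of the patch): with the routing change above, the Cohere path is exercised entirely through `litellm.completion`, mirroring the tests further down. The key value below is a placeholder; per the resolution order above, `api_key`, `litellm.cohere_key`, `COHERE_API_KEY`, `CO_API_KEY`, or `litellm.api_key` all work.

    # Illustrative only -- not part of the patch.
    import os

    from litellm import completion

    os.environ["COHERE_API_KEY"] = "my-cohere-key"  # placeholder value
    messages = [{"role": "user", "content": "Hello, how are you?"}]

    # Non-streaming: main.py hands off to cohere.completion() and returns a populated ModelResponse.
    response = completion(model="command-nightly", messages=messages, max_tokens=50)
    print(response["choices"][0]["message"]["content"])

    # Streaming: the raw line iterator from cohere.completion() is wrapped in CustomStreamWrapper.
    for chunk in completion(model="command-nightly", messages=messages, stream=True, max_tokens=50):
        print(chunk["choices"][0]["delta"]["content"], end="")
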
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 98faeb5c2..e8da75a8c 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -144,7 +144,7 @@ def test_completion_nlp_cloud_streaming():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-test_completion_nlp_cloud_streaming()
+# test_completion_nlp_cloud_streaming()
 # def test_completion_hf_api():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
@@ -183,7 +183,6 @@ def test_completion_cohere(): # commenting for now as the cohere endpoint is bei
         # Add any assertions here to check the response
         print(response)
         response_str = response["choices"][0]["message"]["content"]
-        print(f"str response{response_str}")
         response_str_2 = response.choices[0].message.content
         if type(response_str) != str:
             pytest.fail(f"Error occurred: {e}")
@@ -192,6 +191,8 @@ def test_completion_cohere(): # commenting for now as the cohere endpoint is bei
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
+# test_completion_cohere()
+
 def test_completion_cohere_stream():
     try:
         messages = [
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index c577567b7..43308aa16 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -3,7 +3,7 @@
 import sys, os, asyncio
 import traceback
-import time
+import time, pytest
 sys.path.insert(
     0, os.path.abspath("../..")
 )
@@ -24,6 +24,30 @@ def logger_fn(model_call_object: dict):
 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]
 
+def test_completion_cohere_stream():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="command-nightly", messages=messages, stream=True, max_tokens=50
+        )
+        complete_response = ""
+        # Add any assertions here to check the response
+        for chunk in response:
+            print(f"chunk: {chunk}")
+            complete_response += chunk["choices"][0]["delta"]["content"]
+        if complete_response == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test on baseten completion call
 # try:
 #     response = completion(
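
Illustrative sketch (not part of the patch): the streaming test above reads `chunk["choices"][0]["delta"]["content"]`; the `utils.py` changes that follow supply that content by decoding each raw line from `response.iter_lines()` and returning its `text` field. A small sketch of that per-chunk parsing, with a fabricated sample payload (the exact fields Cohere streams are an assumption here beyond `text`, which the handler relies on):

    # Illustrative only -- not part of the patch; the payload is a made-up example of one streamed line.
    import json

    raw_chunk = b'{"text": " Hello there", "is_finished": false}'  # assumed shape of a streamed line
    data_json = json.loads(raw_chunk.decode("utf-8"))
    # handle_cohere_chunk() (added below) returns data_json["text"], raising ValueError if parsing fails.
    print(data_json["text"])
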
diff --git a/litellm/utils.py b/litellm/utils.py
index 06149c83e..e0c29896e 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1854,7 +1854,7 @@ def exception_type(model, original_exception, custom_llm_provider):
                     llm_provider="replicate",
                     model=model
                 )
-        elif model in litellm.cohere_models:  # Cohere
+        elif model in litellm.cohere_models or custom_llm_provider == "cohere":  # Cohere
             if (
                 "invalid api token" in error_str
                 or "No API key provided." in error_str
@@ -1872,6 +1872,21 @@ def exception_type(model, original_exception, custom_llm_provider):
                     model=model,
                     llm_provider="cohere",
                 )
+            elif hasattr(original_exception, "status_code"):
+                if original_exception.status_code == 400 or original_exception.status_code == 498:
+                    exception_mapping_worked = True
+                    raise InvalidRequestError(
+                        message=f"CohereException - {original_exception.message}",
+                        llm_provider="cohere",
+                        model=model
+                    )
+                elif original_exception.status_code == 500:
+                    exception_mapping_worked = True
+                    raise ServiceUnavailableError(
+                        message=f"CohereException - {original_exception.message}",
+                        llm_provider="cohere",
+                        model=model
+                    )
             elif (
                 "CohereConnectionError" in exception_type
             ):  # cohere seems to fire these errors when we load test it (1k+ messages / min)
@@ -2287,14 +2302,10 @@ class CustomStreamWrapper:
         self.model = model
         self.custom_llm_provider = custom_llm_provider
         self.logging_obj = logging_obj
+        self.completion_stream = completion_stream
         if self.logging_obj:
             # Log the type of the received item
             self.logging_obj.post_call(str(type(completion_stream)))
-        if model in litellm.cohere_models:
-            # these do not return an iterator, so we need to wrap it in one
-            self.completion_stream = iter(completion_stream)
-        else:
-            self.completion_stream = completion_stream
 
     def __iter__(self):
         return self
@@ -2359,6 +2370,16 @@ class CustomStreamWrapper:
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
 
+    def handle_cohere_chunk(self, chunk):
+        chunk = chunk.decode("utf-8")
+        print(f"cohere chunk: {chunk}")
+        data_json = json.loads(chunk)
+        try:
+            print(f"data json: {data_json}")
+            return data_json["text"]
+        except:
+            raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
     def handle_openai_text_completion_chunk(self, chunk):
         try:
             return chunk["choices"][0]["text"]
@@ -2416,9 +2437,6 @@ class CustomStreamWrapper:
                 if text_data == "":
                     return self.__next__()
                 completion_obj["content"] = text_data
-            elif self.model in litellm.cohere_models:
-                chunk = next(self.completion_stream)
-                completion_obj["content"] = chunk.text
             elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = self.handle_huggingface_chunk(chunk)
@@ -2440,6 +2458,9 @@ class CustomStreamWrapper:
             elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk)
+            elif self.model in litellm.cohere_models or self.custom_llm_provider == "cohere":
+                chunk = next(self.completion_stream)
+                completion_obj["content"] = self.handle_cohere_chunk(chunk)
             else:  # openai chat/azure models
                 chunk = next(self.completion_stream)
                 return chunk  # open ai returns finish_reason, we should just return the openai chunk
diff --git a/pyproject.toml b/pyproject.toml
index 5d259cdc3..9a81f0f9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.625"
+version = "0.1.626"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
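
Illustrative sketch (not part of the patch): the new status-code branches in `exception_type` map `CohereError`s from the handler onto litellm's own exception types, 400/498 to `InvalidRequestError` and 500 to `ServiceUnavailableError`. Calling `exception_type` directly like this is purely for illustration, and the error message text is arbitrary.

    # Illustrative only -- not part of the patch.
    from litellm.llms.cohere import CohereError
    from litellm.utils import exception_type

    try:
        exception_type(
            model="command-nightly",
            original_exception=CohereError(status_code=500, message="internal server error"),
            custom_llm_provider="cohere",
        )
    except Exception as e:
        # Expected to surface as a ServiceUnavailableError carrying a "CohereException - ..." message.
        print(type(e).__name__, e)
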