move cohere to http endpoint

Krrish Dholakia 2023-09-14 11:17:38 -07:00
parent 70a36740bc
commit 3b4064a58f
8 changed files with 175 additions and 45 deletions

litellm/llms/cohere.py (new file, 101 lines)

@@ -0,0 +1,101 @@
import os
import json
from enum import Enum
import requests
import time
from typing import Callable
from litellm.utils import ModelResponse
class CohereError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
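# builds the JSON headers for Cohere's REST API; the bearer token is attached only when an api_key is provided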
def validate_environment(api_key):
headers = {
"accept": "application/json",
"content-type": "application/json",
}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
return headers
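# completion handler called by litellm's top-level completion(); it hits Cohere's /v1/generate REST endpoint directly instead of going through the cohere SDK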
def completion(
model: str,
messages: list,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
api_key,
logging_obj,
optional_params=None,
litellm_params=None,
logger_fn=None,
):
headers = validate_environment(api_key)
completion_url = "https://api.cohere.ai/v1/generate"
model = model
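# /v1/generate takes a single prompt string, so the chat messages are concatenated into one prompt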
prompt = " ".join(message["content"] for message in messages)
data = {
"model": model,
"prompt": prompt,
**optional_params,
}
## LOGGING
logging_obj.pre_call(
input=prompt,
api_key=api_key,
additional_args={"complete_input_dict": data},
)
## COMPLETION CALL
response = requests.post(
completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
)
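# when streaming, return the raw line iterator; CustomStreamWrapper wraps it and parses each line with handle_cohere_chunk (added later in this commit)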
if "stream" in optional_params and optional_params["stream"] == True:
return response.iter_lines()
else:
## LOGGING
logging_obj.post_call(
input=prompt,
api_key=api_key,
original_response=response.text,
additional_args={"complete_input_dict": data},
)
print_verbose(f"raw model_response: {response.text}")
## RESPONSE OBJECT
completion_response = response.json()
if "error" in completion_response:
raise CohereError(
message=completion_response["error"],
status_code=response.status_code,
)
else:
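# Cohere returns the completion text under generations[0].text; anything unexpected is surfaced as a CohereError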
try:
model_response["choices"][0]["message"]["content"] = completion_response["generations"][0]["text"]
except:
raise CohereError(message=json.dumps(completion_response), status_code=response.status_code)
## CALCULATING USAGE - token counts are computed locally with the provided encoding
prompt_tokens = len(
encoding.encode(prompt)
)
completion_tokens = len(
encoding.encode(model_response["choices"][0]["message"]["content"])
)
model_response["created"] = time.time()
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"total_tokens": prompt_tokens + completion_tokens,
}
return model_response
def embedding():
# logic for parsing in - calling - parsing out model embedding calls
pass


@@ -32,6 +32,7 @@ from .llms import nlp_cloud
from .llms import baseten
from .llms import vllm
from .llms import ollama
+from .llms import cohere
import tiktoken
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Optional, Dict
@@ -547,12 +548,6 @@ def completion(
input=messages, api_key=openai.api_key, original_response=response
)
elif model in litellm.cohere_models:
-# import cohere/if it fails then pip install cohere
-try:
-import cohere
-except:
-raise Exception("Cohere import failed please run `pip install cohere`")
cohere_key = (
api_key
or litellm.cohere_key
@@ -560,35 +555,23 @@ def completion(
or get_secret("CO_API_KEY")
or litellm.api_key
)
-co = cohere.Client(cohere_key)
-prompt = " ".join([message["content"] for message in messages])
-## LOGGING
-logging.pre_call(input=prompt, api_key=cohere_key)
-## COMPLETION CALL
-response = co.generate(model=model, prompt=prompt, **optional_params)
+model_response = cohere.completion(
+model=model,
+messages=messages,
+model_response=model_response,
+print_verbose=print_verbose,
+optional_params=optional_params,
+litellm_params=litellm_params,
+logger_fn=logger_fn,
+encoding=encoding,
+api_key=cohere_key,
+logging_obj=logging # model call logging done inside the class, as we may need to modify I/O to fit cohere's requirements
+)
if "stream" in optional_params and optional_params["stream"] == True:
# don't try to access stream object,
-response = CustomStreamWrapper(response, model, logging_obj=logging)
+response = CustomStreamWrapper(model_response, model, logging_obj=logging)
return response
-## LOGGING
-logging.post_call(
-input=prompt, api_key=cohere_key, original_response=response
-)
-## USAGE
-completion_response = response[0].text
-prompt_tokens = len(encoding.encode(prompt))
-completion_tokens = len(encoding.encode(completion_response))
-## RESPONSE OBJECT
-model_response["choices"][0]["message"]["content"] = completion_response
-if response[0].finish_reason:
-model_response.choices[0].finish_reason = response[0].finish_reason
-model_response["created"] = time.time()
-model_response["model"] = model
-model_response["usage"] = {
-"prompt_tokens": prompt_tokens,
-"completion_tokens": completion_tokens,
-"total_tokens": prompt_tokens + completion_tokens,
-}
response = model_response
elif (
(


@@ -144,7 +144,7 @@ def test_completion_nlp_cloud_streaming():
except Exception as e:
pytest.fail(f"Error occurred: {e}")
-test_completion_nlp_cloud_streaming()
+# test_completion_nlp_cloud_streaming()
# def test_completion_hf_api():
# try:
# user_message = "write some code to find the sum of two numbers"
@@ -183,7 +183,6 @@ def test_completion_cohere(): # commenting for now as the cohere endpoint is bei
# Add any assertions here to check the response
print(response)
response_str = response["choices"][0]["message"]["content"]
-print(f"str response{response_str}")
response_str_2 = response.choices[0].message.content
if type(response_str) != str:
pytest.fail(f"Error occurred: {e}")
@@ -192,6 +191,8 @@ def test_completion_cohere(): # commenting for now as the cohere endpoint is bei
except Exception as e:
pytest.fail(f"Error occurred: {e}")
+# test_completion_cohere()
def test_completion_cohere_stream():
try:
messages = [


@@ -3,7 +3,7 @@
import sys, os, asyncio
import traceback
-import time
+import time, pytest
sys.path.insert(
0, os.path.abspath("../..")
@@ -24,6 +24,30 @@ def logger_fn(model_call_object: dict):
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
+def test_completion_cohere_stream():
+try:
+messages = [
+{"role": "system", "content": "You are a helpful assistant."},
+{
+"role": "user",
+"content": "how does a court case get to the Supreme Court?",
+},
+]
+response = completion(
+model="command-nightly", messages=messages, stream=True, max_tokens=50
+)
+complete_response = ""
+# Add any assertions here to check the response
+for chunk in response:
+print(f"chunk: {chunk}")
+complete_response += chunk["choices"][0]["delta"]["content"]
+if complete_response == "":
+raise Exception("Empty response received")
+print(f"completion_response: {complete_response}")
+except Exception as e:
+pytest.fail(f"Error occurred: {e}")
# test on baseten completion call
# try:
# response = completion(


@@ -1854,7 +1854,7 @@ def exception_type(model, original_exception, custom_llm_provider):
llm_provider="replicate",
model=model
)
-elif model in litellm.cohere_models: # Cohere
+elif model in litellm.cohere_models or custom_llm_provider == "cohere": # Cohere
if (
"invalid api token" in error_str
or "No API key provided." in error_str
@@ -1872,6 +1872,21 @@ def exception_type(model, original_exception, custom_llm_provider):
model=model,
llm_provider="cohere",
)
+elif hasattr(original_exception, "status_code"):
+if original_exception.status_code == 400 or original_exception.status_code == 498:
+exception_mapping_worked = True
+raise InvalidRequestError(
+message=f"CohereException - {original_exception.message}",
+llm_provider="cohere",
+model=model
+)
+elif original_exception.status_code == 500:
+exception_mapping_worked = True
+raise ServiceUnavailableError(
+message=f"CohereException - {original_exception.message}",
+llm_provider="cohere",
+model=model
+)
elif (
"CohereConnectionError" in exception_type
): # cohere seems to fire these errors when we load test it (1k+ messages / min)
@@ -2287,14 +2302,10 @@ class CustomStreamWrapper:
self.model = model
self.custom_llm_provider = custom_llm_provider
self.logging_obj = logging_obj
+self.completion_stream = completion_stream
if self.logging_obj:
# Log the type of the received item
self.logging_obj.post_call(str(type(completion_stream)))
-if model in litellm.cohere_models:
-# these do not return an iterator, so we need to wrap it in one
-self.completion_stream = iter(completion_stream)
-else:
-self.completion_stream = completion_stream
def __iter__(self):
return self
@@ -2359,6 +2370,16 @@
except:
raise ValueError(f"Unable to parse response. Original response: {chunk}")
+def handle_cohere_chunk(self, chunk):
+chunk = chunk.decode("utf-8")
+print(f"cohere chunk: {chunk}")
+data_json = json.loads(chunk)
+try:
+print(f"data json: {data_json}")
+return data_json["text"]
+except:
+raise ValueError(f"Unable to parse response. Original response: {chunk}")
def handle_openai_text_completion_chunk(self, chunk):
try:
return chunk["choices"][0]["text"]
@@ -2416,9 +2437,6 @@
if text_data == "":
return self.__next__()
completion_obj["content"] = text_data
-elif self.model in litellm.cohere_models:
-chunk = next(self.completion_stream)
-completion_obj["content"] = chunk.text
elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_huggingface_chunk(chunk)
@@ -2440,6 +2458,9 @@
elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
chunk = next(self.completion_stream)
completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk)
+elif self.model in litellm.cohere_models or self.custom_llm_provider == "cohere":
+chunk = next(self.completion_stream)
+completion_obj["content"] = self.handle_cohere_chunk(chunk)
else: # openai chat/azure models
chunk = next(self.completion_stream)
return chunk # open ai returns finish_reason, we should just return the openai chunk


@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "0.1.625"
+version = "0.1.626"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT License"