diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index a37c35ddb..5c001e9ab 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 0fdab360c..c22fe2c58 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index b8c0d351b..bf7b112ca 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py
new file mode 100644
index 000000000..4712000a9
--- /dev/null
+++ b/litellm/llms/together_ai.py
@@ -0,0 +1,131 @@
+import os, json
+from enum import Enum
+import requests
+import time
+from typing import Callable
+from litellm.utils import ModelResponse
+
+
+class TogetherAIError(Exception):
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+
+class TogetherAILLM:
+    def __init__(self, encoding, logging_obj, api_key=None):
+        self.encoding = encoding
+        self.completion_url = "https://api.together.xyz/inference"
+        self.api_key = api_key
+        self.logging_obj = logging_obj
+        self.validate_environment(api_key=api_key)
+
+    def validate_environment(
+        self, api_key
+    ):  # set up the environment required to run the model
+        # set the api key
+        if self.api_key == None:
+            raise ValueError(
+                "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
+            )
+        self.api_key = api_key
+        self.headers = {
+            "accept": "application/json",
+            "content-type": "application/json",
+            "Authorization": "Bearer " + self.api_key,
+        }
+
+    def completion(
+        self,
+        model: str,
+        messages: list,
+        model_response: ModelResponse,
+        print_verbose: Callable,
+        optional_params=None,
+        litellm_params=None,
+        logger_fn=None,
+    ):  # logic for parsing in - calling - parsing out model completion calls
+        model = model
+        prompt = ""
+        for message in messages:
+            if "role" in message:
+                if message["role"] == "user":
+                    prompt += f"{message['content']}"
+                else:
+                    prompt += f"{message['content']}"
+            else:
+                prompt += f"{message['content']}"
+        data = {
+            "model": model,
+            "prompt": prompt,
+            "request_type": "language-model-inference",
+            **optional_params,
+        }
+
+        ## LOGGING
+        self.logging_obj.pre_call(
+            input=prompt,
+            api_key=self.api_key,
+            additional_args={"complete_input_dict": data},
+        )
+        ## COMPLETION CALL
+        if (
+            "stream_tokens" in optional_params
+            and optional_params["stream_tokens"] == True
+        ):
+            response = requests.post(
+                self.completion_url,
+                headers=self.headers,
+                data=json.dumps(data),
+                stream=optional_params["stream_tokens"],
+            )
+            return response.iter_lines()
+        else:
+            response = requests.post(
+                self.completion_url,
+                headers=self.headers,
+                data=json.dumps(data)
+            )
+            ## LOGGING
+            self.logging_obj.post_call(
+                input=prompt,
+                api_key=self.api_key,
+                original_response=response.text,
+                additional_args={"complete_input_dict": data},
+            )
+            print_verbose(f"raw model_response: {response.text}")
+            ## RESPONSE OBJECT
+            completion_response = response.json()
+
+            if "error" in completion_response:
+                raise TogetherAIError(
+                    message=json.dumps(completion_response),
+                    status_code=response.status_code,
+                )
+            elif "error" in completion_response["output"]:
+                raise TogetherAIError(message=json.dumps(completion_response["output"]), status_code=response.status_code)
+
+            completion_response = completion_response["output"]["choices"][0]["text"]
+
+            ## CALCULATING USAGE
+            prompt_tokens = len(self.encoding.encode(prompt))
+            completion_tokens = len(
+                self.encoding.encode(completion_response)
+            )
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            }
+            return model_response
+
+    def embedding(
+        self,
+    ):  # logic for parsing in - calling - parsing out model embedding calls
+        pass
diff --git a/litellm/main.py b/litellm/main.py
index 2a68e0469..0aa791673 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -23,6 +23,7 @@ from .llms.anthropic import AnthropicLLM
 from .llms.huggingface_restapi import HuggingfaceRestAPILLM
 from .llms.baseten import BasetenLLM
 from .llms.ai21 import AI21LLM
+from .llms.together_ai import TogetherAILLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
@@ -540,78 +541,30 @@ def completion(
         response = model_response
     elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
         custom_llm_provider = "together_ai"
-        import requests
-
-        TOGETHER_AI_TOKEN = (
-            get_secret("TOGETHER_AI_TOKEN")
-            or get_secret("TOGETHERAI_API_KEY")
-            or get_secret("TOGETHER_AI_API_KEY")
-            or api_key
+        together_ai_key = (
+            api_key or litellm.togetherai_api_key
+            or get_secret("TOGETHER_AI_TOKEN")
+            or get_secret("TOGETHERAI_API_KEY")
         )
-        headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"}
-        endpoint = "https://api.together.xyz/inference"
-        prompt = " ".join(
-            [message["content"] for message in messages]
-        )  # TODO: Add chat support for together AI
-        ## LOGGING
-        logging.pre_call(input=prompt, api_key=TOGETHER_AI_TOKEN)
-
-        print(f"TOGETHER_AI_TOKEN: {TOGETHER_AI_TOKEN}")
-        if (
-            "stream_tokens" in optional_params
-            and optional_params["stream_tokens"] == True
-        ):
-            res = requests.post(
-                endpoint,
-                json={
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                stream=optional_params["stream_tokens"],
-                headers=headers,
-            )
+        together_ai_client = TogetherAILLM(encoding=encoding, api_key=together_ai_key, logging_obj=logging)
+        model_response = together_ai_client.completion(
+            model=model,
+            messages=messages,
+            model_response=model_response,
+            print_verbose=print_verbose,
+            optional_params=optional_params,
+            litellm_params=litellm_params,
+            logger_fn=logger_fn,
+        )
+        if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
+            # don't touch the raw stream object here; hand it straight to CustomStreamWrapper
             response = CustomStreamWrapper(
-                res.iter_lines(), model, custom_llm_provider="together_ai", logging_obj=logging
+                model_response, model, custom_llm_provider="together_ai", logging_obj=logging
             )
             return response
-        else:
-            res = requests.post(
-                endpoint,
-                json={
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                headers=headers,
-            )
-            ## LOGGING
-            logging.post_call(
-                input=prompt, api_key=TOGETHER_AI_TOKEN, original_response=res.text
-            )
-            # make this safe for reading, if output does not exist raise an error
-            json_response = res.json()
-            if "error" in json_response:
-                raise Exception(json.dumps(json_response))
-            elif "error" in json_response["output"]:
-                raise Exception(json.dumps(json_response["output"]))
-            completion_response = json_response["output"]["choices"][0]["text"]
-            prompt_tokens = len(encoding.encode(prompt))
-            completion_tokens = len(encoding.encode(completion_response))
-            ## RESPONSE OBJECT
-            model_response["choices"][0]["message"]["content"] = completion_response
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            }
-            response = model_response
+        response = model_response
     elif model in litellm.vertex_chat_models:
         import vertexai
         from vertexai.preview.language_models import ChatModel, InputOutputTextPair
diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py
index 3367bfa16..396cdc079 100644
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@@ -144,42 +144,40 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
     invalid_auth(test_model)
 
+
 # Test 3: Rate Limit Errors
-def test_model(model):
-    try:
-        sample_text = "how does a court case get to the Supreme Court?" * 50000
-        messages = [{ "content": sample_text,"role": "user"}]
-        custom_llm_provider = None
-        if model == "chatgpt-test":
-            custom_llm_provider = "azure"
-        print(f"model: {model}")
-        response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
-    except RateLimitError:
-        return True
-    except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
-        return True
-    except Exception as e:
-        print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
-        pass
-    return False
+# def test_model_call(model):
+#     try:
+#         sample_text = "how does a court case get to the Supreme Court?"
+#         messages = [{ "content": sample_text,"role": "user"}]
+#         print(f"model: {model}")
+#         response = completion(model=model, messages=messages)
+#     except RateLimitError:
+#         return True
+#     except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
+#         return True
+#     except Exception as e:
+#         print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
+#         traceback.print_exc()
+#         pass
+#     return False
+# # Repeat each model 500 times
+# extended_models = [model for model in models for _ in range(250)]
 
-# Repeat each model 500 times
-extended_models = [model for model in models for _ in range(250)]
 
+# def worker(model):
+#     return test_model_call(model)
 
-def worker(model):
-    return test_model(model)
+# # Create a dictionary to store the results
+# counts = {True: 0, False: 0}
 
-# Create a dictionary to store the results
-counts = {True: 0, False: 0}
+# # Use Thread Pool Executor
+# with ThreadPoolExecutor(max_workers=500) as executor:
+#     # Use map to start the operation in thread pool
+#     results = executor.map(worker, extended_models)
 
-# Use Thread Pool Executor
-with ThreadPoolExecutor(max_workers=500) as executor:
-    # Use map to start the operation in thread pool
-    results = executor.map(worker, extended_models)
+#     # Iterate over results and count True/False
+#     for result in results:
+#         counts[result] += 1
 
-    # Iterate over results and count True/False
-    for result in results:
-        counts[result] += 1
-
-accuracy_score = counts[True]/(counts[True] + counts[False])
-print(f"accuracy_score: {accuracy_score}")
+# accuracy_score = counts[True]/(counts[True] + counts[False])
+# print(f"accuracy_score: {accuracy_score}")
diff --git a/litellm/utils.py b/litellm/utils.py
index 777c9f3b6..994b4916c 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1451,30 +1451,36 @@ def exception_type(model, original_exception, custom_llm_provider):
             if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
                 exception_mapping_worked = True
                 raise ContextWindowExceededError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
             elif "error" in error_response and "invalid private key" in error_response["error"]:
                 exception_mapping_worked = True
                 raise AuthenticationError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     llm_provider="together_ai"
                 )
             elif "error" in error_response and "INVALID_ARGUMENT" in error_response["error"]:
                 exception_mapping_worked = True
                 raise InvalidRequestError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
             elif "error_type" in error_response and error_response["error_type"] == "validation":
                 exception_mapping_worked = True
                 raise InvalidRequestError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
+            elif original_exception.status_code == 429:
+                exception_mapping_worked = True
+                raise RateLimitError(
+                    message=f"TogetherAIException - {original_exception.message}",
+                    llm_provider="together_ai",
+                )
             print(f"error: {error_response}")
             print(f"e: {original_exception}")
             raise original_exception  # base case - return the original exception
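Usage sketch (illustrative, not part of the patch): with the hunks above applied, a call is routed through TogetherAILLM whenever custom_llm_provider="together_ai" is passed or the model name contains "togethercomputer", and the key is resolved from the api_key argument, litellm.togetherai_api_key, or the TOGETHER_AI_TOKEN / TOGETHERAI_API_KEY secrets. The model name and key below are placeholders, and it is assumed that get_secret() falls back to environment variables.

import os
from litellm import completion

# Placeholder credential; assumption: get_secret("TOGETHERAI_API_KEY") reads it from the environment.
os.environ["TOGETHERAI_API_KEY"] = "..."

messages = [{"role": "user", "content": "What is the capital of France?"}]

# "togethercomputer" in the model name selects the together_ai branch in main.py, which calls
# TogetherAILLM.completion(); that POSTs to https://api.together.xyz/inference and fills in
# choices, created, model, and usage on the returned ModelResponse.
response = completion(
    model="togethercomputer/llama-2-70b-chat",  # placeholder model name
    messages=messages,
)
print(response["choices"][0]["message"]["content"])
print(response["usage"])  # prompt_tokens / completion_tokens / total_tokens

Streaming follows the same route: when optional_params carries stream_tokens=True, TogetherAILLM.completion() returns response.iter_lines(), which the main.py hunk wraps in CustomStreamWrapper and returns as an iterator.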