forked from phoenix/litellm-mirror
add coverage for rate limit errors to togetherai
parent 88bd1df3e0
commit f11599e50c
7 changed files with 191 additions and 103 deletions
3 binary files changed (contents not shown).
litellm/llms/together_ai.py (new file, 131 additions)
@@ -0,0 +1,131 @@
import os, json
from enum import Enum
import requests
import time
from typing import Callable
from litellm.utils import ModelResponse


class TogetherAIError(Exception):
    def __init__(self, status_code, message):
        self.status_code = status_code
        self.message = message
        super().__init__(
            self.message
        )  # Call the base class constructor with the parameters it needs


class TogetherAILLM:
    def __init__(self, encoding, logging_obj, api_key=None):
        self.encoding = encoding
        self.completion_url = "https://api.together.xyz/inference"
        self.api_key = api_key
        self.logging_obj = logging_obj
        self.validate_environment(api_key=api_key)

    def validate_environment(
        self, api_key
    ):  # set up the environment required to run the model
        # set the api key
        if self.api_key == None:
            raise ValueError(
                "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
            )
        self.api_key = api_key
        self.headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "Authorization": "Bearer " + self.api_key,
        }

    def completion(
        self,
        model: str,
        messages: list,
        model_response: ModelResponse,
        print_verbose: Callable,
        optional_params=None,
        litellm_params=None,
        logger_fn=None,
    ):  # logic for parsing in - calling - parsing out model completion calls
        model = model
        prompt = ""
        for message in messages:
            if "role" in message:
                if message["role"] == "user":
                    prompt += f"{message['content']}"
                else:
                    prompt += f"{message['content']}"
            else:
                prompt += f"{message['content']}"
        data = {
            "model": model,
            "prompt": prompt,
            "request_type": "language-model-inference",
            **optional_params,
        }

        ## LOGGING
        self.logging_obj.pre_call(
            input=prompt,
            api_key=self.api_key,
            additional_args={"complete_input_dict": data},
        )
        ## COMPLETION CALL
        if (
            "stream_tokens" in optional_params
            and optional_params["stream_tokens"] == True
        ):
            response = requests.post(
                self.completion_url,
                headers=self.headers,
                data=json.dumps(data),
                stream=optional_params["stream_tokens"],
            )
            return response.iter_lines()
        else:
            response = requests.post(
                self.completion_url,
                headers=self.headers,
                data=json.dumps(data)
            )
            ## LOGGING
            self.logging_obj.post_call(
                input=prompt,
                api_key=self.api_key,
                original_response=response.text,
                additional_args={"complete_input_dict": data},
            )
            print_verbose(f"raw model_response: {response.text}")
            ## RESPONSE OBJECT
            completion_response = response.json()

            if "error" in completion_response:
                raise TogetherAIError(
                    message=json.dumps(completion_response),
                    status_code=response.status_code,
                )
            elif "error" in completion_response["output"]:
                raise TogetherAIError(message=json.dumps(completion_response["output"]), status_code=response.status_code)

            completion_response = completion_response["output"]["choices"][0]["text"]

            ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
            prompt_tokens = len(self.encoding.encode(prompt))
            completion_tokens = len(
                self.encoding.encode(completion_response)
            )
            model_response["choices"][0]["message"]["content"] = completion_response
            model_response["created"] = time.time()
            model_response["model"] = model
            model_response["usage"] = {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            }
            return model_response

    def embedding(
        self,
    ):  # logic for parsing in - calling - parsing out model embedding calls
        pass
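For orientation, a minimal sketch of exercising this handler directly (not part of the diff; the _NoOpLogger stand-in, the tiktoken encoding choice, and the model name are illustrative assumptions):

# Illustrative sketch only - assumes a Together AI key in the environment.
import os
import tiktoken
from litellm.llms.together_ai import TogetherAILLM
from litellm.utils import ModelResponse

class _NoOpLogger:  # hypothetical stand-in for litellm's internal Logging object
    def pre_call(self, *args, **kwargs): pass
    def post_call(self, *args, **kwargs): pass

client = TogetherAILLM(
    encoding=tiktoken.get_encoding("cl100k_base"),
    logging_obj=_NoOpLogger(),
    api_key=os.environ.get("TOGETHERAI_API_KEY"),  # None here raises ValueError in validate_environment
)
result = client.completion(
    model="togethercomputer/llama-2-70b-chat",  # assumed model name
    messages=[{"role": "user", "content": "Hello"}],
    model_response=ModelResponse(),
    print_verbose=print,
    optional_params={},  # must be a dict; the handler unpacks it into the request body
)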
@@ -23,6 +23,7 @@ from .llms.anthropic import AnthropicLLM
 from .llms.huggingface_restapi import HuggingfaceRestAPILLM
 from .llms.baseten import BasetenLLM
 from .llms.ai21 import AI21LLM
+from .llms.together_ai import TogetherAILLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
@@ -540,77 +541,29 @@ def completion(
         response = model_response
     elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
         custom_llm_provider = "together_ai"
-        import requests
-
-        TOGETHER_AI_TOKEN = (
-            get_secret("TOGETHER_AI_TOKEN")
-            or get_secret("TOGETHERAI_API_KEY")
-            or get_secret("TOGETHER_AI_API_KEY")
-            or api_key
+        together_ai_key = (
+            api_key
             or litellm.togetherai_api_key
+            or get_secret("TOGETHER_AI_TOKEN")
+            or get_secret("TOGETHERAI_API_KEY")
         )
-        headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"}
-        endpoint = "https://api.together.xyz/inference"
-        prompt = " ".join(
-            [message["content"] for message in messages]
-        )  # TODO: Add chat support for together AI
 
-        ## LOGGING
-        logging.pre_call(input=prompt, api_key=TOGETHER_AI_TOKEN)
-
-        print(f"TOGETHER_AI_TOKEN: {TOGETHER_AI_TOKEN}")
-        if (
-            "stream_tokens" in optional_params
-            and optional_params["stream_tokens"] == True
-        ):
-            res = requests.post(
-                endpoint,
-                json={
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                stream=optional_params["stream_tokens"],
-                headers=headers,
-            )
+        together_ai_client = TogetherAILLM(encoding=encoding, api_key=together_ai_key, logging_obj=logging)
+        model_response = together_ai_client.completion(
+            model=model,
+            messages=messages,
+            model_response=model_response,
+            print_verbose=print_verbose,
+            optional_params=optional_params,
+            litellm_params=litellm_params,
+            logger_fn=logger_fn,
+        )
+        if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
+            # don't try to access stream object,
             response = CustomStreamWrapper(
-                res.iter_lines(), model, custom_llm_provider="together_ai", logging_obj=logging
+                model_response, model, custom_llm_provider="together_ai", logging_obj=logging
             )
             return response
-        else:
-            res = requests.post(
-                endpoint,
-                json={
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                headers=headers,
-            )
-            ## LOGGING
-            logging.post_call(
-                input=prompt, api_key=TOGETHER_AI_TOKEN, original_response=res.text
-            )
-            # make this safe for reading, if output does not exist raise an error
-            json_response = res.json()
-            if "error" in json_response:
-                raise Exception(json.dumps(json_response))
-            elif "error" in json_response["output"]:
-                raise Exception(json.dumps(json_response["output"]))
-            completion_response = json_response["output"]["choices"][0]["text"]
-            prompt_tokens = len(encoding.encode(prompt))
-            completion_tokens = len(encoding.encode(completion_response))
-            ## RESPONSE OBJECT
-            model_response["choices"][0]["message"]["content"] = completion_response
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            }
-            response = model_response
+        response = model_response
     elif model in litellm.vertex_chat_models:
         import vertexai
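For context (not part of the diff), a call routed through litellm's public completion() would now reach the TogetherAILLM handler above; a minimal sketch, with the model name assumed:

# Illustrative sketch only.
import litellm

litellm.togetherai_api_key = "..."  # or set TOGETHERAI_API_KEY / TOGETHER_AI_TOKEN in the env
response = litellm.completion(
    model="togethercomputer/llama-2-70b-chat",  # assumed model name; "togethercomputer" in the name triggers the together_ai branch
    messages=[{"role": "user", "content": "What does a mutex do?"}],
)
print(response["choices"][0]["message"]["content"])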
@@ -144,42 +144,40 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
 
 invalid_auth(test_model)
 
 # Test 3: Rate Limit Errors
-def test_model(model):
-    try:
-        sample_text = "how does a court case get to the Supreme Court?" * 50000
-        messages = [{ "content": sample_text,"role": "user"}]
-        custom_llm_provider = None
-        if model == "chatgpt-test":
-            custom_llm_provider = "azure"
-        print(f"model: {model}")
-        response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
-    except RateLimitError:
-        return True
-    except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
-        return True
-    except Exception as e:
-        print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
-        pass
-    return False
-
-# Repeat each model 500 times
-extended_models = [model for model in models for _ in range(250)]
-
-def worker(model):
-    return test_model(model)
-
-# Create a dictionary to store the results
-counts = {True: 0, False: 0}
-
-# Use Thread Pool Executor
-with ThreadPoolExecutor(max_workers=500) as executor:
-    # Use map to start the operation in thread pool
-    results = executor.map(worker, extended_models)
-
-    # Iterate over results and count True/False
-    for result in results:
-        counts[result] += 1
-
-accuracy_score = counts[True]/(counts[True] + counts[False])
-print(f"accuracy_score: {accuracy_score}")
+# def test_model_call(model):
+#     try:
+#         sample_text = "how does a court case get to the Supreme Court?"
+#         messages = [{ "content": sample_text,"role": "user"}]
+#         print(f"model: {model}")
+#         response = completion(model=model, messages=messages)
+#     except RateLimitError:
+#         return True
+#     except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
+#         return True
+#     except Exception as e:
+#         print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
+#         traceback.print_exc()
+#         pass
+#     return False
+# # Repeat each model 500 times
+# extended_models = [model for model in models for _ in range(250)]
+
+# def worker(model):
+#     return test_model_call(model)
+
+# # Create a dictionary to store the results
+# counts = {True: 0, False: 0}
+
+# # Use Thread Pool Executor
+# with ThreadPoolExecutor(max_workers=500) as executor:
+#     # Use map to start the operation in thread pool
+#     results = executor.map(worker, extended_models)
+
+#     # Iterate over results and count True/False
+#     for result in results:
+#         counts[result] += 1
+
+# accuracy_score = counts[True]/(counts[True] + counts[False])
+# print(f"accuracy_score: {accuracy_score}")
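A lighter-weight shape for a Together AI rate-limit check would be possible with the 429 mapping added below; a hypothetical sketch (not in this commit), assuming completion and RateLimitError are the names already imported at the top of this test file and the model name is arbitrary:

# Hypothetical sketch only.
def test_together_ai_rate_limit():
    sample_text = "how does a court case get to the Supreme Court?" * 50
    messages = [{"content": sample_text, "role": "user"}]
    try:
        for _ in range(20):  # arbitrary burst size
            completion(model="togethercomputer/llama-2-70b-chat", messages=messages)
    except RateLimitError:
        pass  # expected once Together AI starts throttling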
@@ -1451,30 +1451,36 @@ def exception_type(model, original_exception, custom_llm_provider):
             if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
                 exception_mapping_worked = True
                 raise ContextWindowExceededError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
             elif "error" in error_response and "invalid private key" in error_response["error"]:
                 exception_mapping_worked = True
                 raise AuthenticationError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     llm_provider="together_ai"
                 )
             elif "error" in error_response and "INVALID_ARGUMENT" in error_response["error"]:
                 exception_mapping_worked = True
                 raise InvalidRequestError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
             elif "error_type" in error_response and error_response["error_type"] == "validation":
                 exception_mapping_worked = True
                 raise InvalidRequestError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
+            elif original_exception.status_code == 429:
+                exception_mapping_worked = True
+                raise RateLimitError(
+                    message=f"TogetherAIException - {original_exception.message}",
+                    llm_provider="together_ai",
+                )
             print(f"error: {error_response}")
             print(f"e: {original_exception}")
             raise original_exception # base case - return the original exception
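For context (not part of the diff), mapping Together AI 429s to RateLimitError is what lets callers back off cleanly instead of catching a raw provider exception; a minimal caller-side sketch, where the import path for RateLimitError is an assumption (litellm's exception types in this era subclass openai.error) and the backoff values are arbitrary:

# Hypothetical caller-side sketch only.
import time
import litellm
from openai.error import RateLimitError  # assumed import path (openai<1.0 era)

def completion_with_backoff(model, messages, retries=3):
    last_err = None
    for attempt in range(retries):
        try:
            return litellm.completion(model=model, messages=messages)
        except RateLimitError as err:  # Together AI 429s now land here
            last_err = err
            time.sleep(2 ** attempt)  # arbitrary exponential backoff
    raise last_err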