diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index a37c35ddb..5c001e9ab 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 0fdab360c..c22fe2c58 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index b8c0d351b..bf7b112ca 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py
new file mode 100644
index 000000000..4712000a9
--- /dev/null
+++ b/litellm/llms/together_ai.py
@@ -0,0 +1,131 @@
+import os, json
+from enum import Enum
+import requests
+import time
+from typing import Callable
+from litellm.utils import ModelResponse
+
+
+class TogetherAIError(Exception):
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+
+class TogetherAILLM:
+    def __init__(self, encoding, logging_obj, api_key=None):
+        self.encoding = encoding
+        self.completion_url = "https://api.together.xyz/inference"
+        self.api_key = api_key
+        self.logging_obj = logging_obj
+        self.validate_environment(api_key=api_key)
+
+    def validate_environment(
+        self, api_key
+    ):  # set up the environment required to run the model
+        # set the api key
+        if self.api_key == None:
+            raise ValueError(
+                "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
+            )
+        self.api_key = api_key
+        self.headers = {
+            "accept": "application/json",
+            "content-type": "application/json",
+            "Authorization": "Bearer " + self.api_key,
+        }
+
+    def completion(
+        self,
+        model: str,
+        messages: list,
+        model_response: ModelResponse,
+        print_verbose: Callable,
+        optional_params=None,
+        litellm_params=None,
+        logger_fn=None,
+    ):  # logic for parsing in - calling - parsing out model completion calls
+        model = model
+        prompt = ""
+        for message in messages:
+            if "role" in message:
+                if message["role"] == "user":
+                    prompt += f"{message['content']}"
+                else:
+                    prompt += f"{message['content']}"
+            else:
+                prompt += f"{message['content']}"
+        data = {
+            "model": model,
+            "prompt": prompt,
+            "request_type": "language-model-inference",
+            **optional_params,
+        }
+
+        ## LOGGING
+        self.logging_obj.pre_call(
+            input=prompt,
+            api_key=self.api_key,
+            additional_args={"complete_input_dict": data},
+        )
+        ## COMPLETION CALL
+        if (
+            "stream_tokens" in optional_params
+            and optional_params["stream_tokens"] == True
+        ):
+            response = requests.post(
+                self.completion_url,
+                headers=self.headers,
+                data=json.dumps(data),
+                stream=optional_params["stream_tokens"],
+            )
+            return response.iter_lines()
+        else:
+            response = requests.post(
+                self.completion_url,
+                headers=self.headers,
+                data=json.dumps(data)
+            )
+            ## LOGGING
+            self.logging_obj.post_call(
+                input=prompt,
+                api_key=self.api_key,
+                original_response=response.text,
+                additional_args={"complete_input_dict": data},
+            )
+            print_verbose(f"raw model_response: {response.text}")
+            ## RESPONSE OBJECT
+            completion_response = response.json()
+
+            if "error" in completion_response:
+                raise TogetherAIError(
+                    message=json.dumps(completion_response),
+                    status_code=response.status_code,
+                )
+            elif "error" in completion_response["output"]:
+                raise TogetherAIError(message=json.dumps(completion_response["output"]), status_code=response.status_code)
+
+            completion_response = completion_response["output"]["choices"][0]["text"]
+
+            ## CALCULATING USAGE
+            prompt_tokens = len(self.encoding.encode(prompt))
+            completion_tokens = len(
+                self.encoding.encode(completion_response)
+            )
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens,
+            }
+            return model_response
+
+    def embedding(
+        self,
+    ):  # logic for parsing in - calling - parsing out model embedding calls
+        pass
diff --git a/litellm/main.py b/litellm/main.py
index 2a68e0469..0aa791673 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -23,6 +23,7 @@ from .llms.anthropic import AnthropicLLM
 from .llms.huggingface_restapi import HuggingfaceRestAPILLM
 from .llms.baseten import BasetenLLM
 from .llms.ai21 import AI21LLM
+from .llms.together_ai import TogetherAILLM
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
@@ -540,78 +541,30 @@ def completion(
         response = model_response
     elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
         custom_llm_provider = "together_ai"
-        import requests
-
-        TOGETHER_AI_TOKEN = (
-            get_secret("TOGETHER_AI_TOKEN")
-            or get_secret("TOGETHERAI_API_KEY")
-            or get_secret("TOGETHER_AI_API_KEY")
-            or api_key
+        together_ai_key = (
+            api_key or litellm.togetherai_api_key
+            or get_secret("TOGETHER_AI_TOKEN")
+            or get_secret("TOGETHERAI_API_KEY")
         )
-        headers = {"Authorization": f"Bearer {TOGETHER_AI_TOKEN}"}
-        endpoint = "https://api.together.xyz/inference"
-        prompt = " ".join(
-            [message["content"] for message in messages]
-        )  # TODO: Add chat support for together AI
-        ## LOGGING
-        logging.pre_call(input=prompt, api_key=TOGETHER_AI_TOKEN)
-
-        print(f"TOGETHER_AI_TOKEN: {TOGETHER_AI_TOKEN}")
-        if (
-            "stream_tokens" in optional_params
-            and optional_params["stream_tokens"] == True
-        ):
-            res = requests.post(
-                endpoint,
-                json={
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                stream=optional_params["stream_tokens"],
-                headers=headers,
-            )
+        together_ai_client = TogetherAILLM(encoding=encoding, api_key=together_ai_key, logging_obj=logging)
+        model_response = together_ai_client.completion(
+            model=model,
+            messages=messages,
+            model_response=model_response,
+            print_verbose=print_verbose,
+            optional_params=optional_params,
+            litellm_params=litellm_params,
+            logger_fn=logger_fn,
+        )
+        if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
+            # don't touch the raw stream object here; hand it straight to CustomStreamWrapper
             response = CustomStreamWrapper(
-                res.iter_lines(), model, custom_llm_provider="together_ai", logging_obj=logging
+                model_response, model, custom_llm_provider="together_ai", logging_obj=logging
             )
             return response
-        else:
-            res = requests.post(
-                endpoint,
-                json={
-                    "model": model,
-                    "prompt": prompt,
-                    "request_type": "language-model-inference",
-                    **optional_params,
-                },
-                headers=headers,
-            )
-            ## LOGGING
-            logging.post_call(
-                input=prompt, api_key=TOGETHER_AI_TOKEN, original_response=res.text
-            )
-            # make this safe for reading, if output does not exist raise an error
-            json_response = res.json()
-            if "error" in json_response:
-                raise Exception(json.dumps(json_response))
-            elif "error" in json_response["output"]:
-                raise Exception(json.dumps(json_response["output"]))
-            completion_response = json_response["output"]["choices"][0]["text"]
-            prompt_tokens = len(encoding.encode(prompt))
-            completion_tokens = len(encoding.encode(completion_response))
-            ## RESPONSE OBJECT
-            model_response["choices"][0]["message"]["content"] = completion_response
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            }
-            response = model_response
+        response = model_response
     elif model in litellm.vertex_chat_models:
         import vertexai
         from vertexai.preview.language_models import ChatModel, InputOutputTextPair
diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py
index 3367bfa16..396cdc079 100644
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@@ -144,42 +144,40 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
     invalid_auth(test_model)
 
+
 # Test 3: Rate Limit Errors
-def test_model(model):
-    try:
-        sample_text = "how does a court case get to the Supreme Court?" * 50000
-        messages = [{ "content": sample_text,"role": "user"}]
-        custom_llm_provider = None
-        if model == "chatgpt-test":
-            custom_llm_provider = "azure"
-        print(f"model: {model}")
-        response = completion(model=model, messages=messages, custom_llm_provider=custom_llm_provider)
-    except RateLimitError:
-        return True
-    except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
-        return True
-    except Exception as e:
-        print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
-        pass
-    return False
+# def test_model_call(model):
+#     try:
+#         sample_text = "how does a court case get to the Supreme Court?"
+#         messages = [{ "content": sample_text,"role": "user"}]
+#         print(f"model: {model}")
+#         response = completion(model=model, messages=messages)
+#     except RateLimitError:
+#         return True
+#     except OpenAIError: # is at least an openai error -> in case of random model errors - e.g. overloaded server
+#         return True
+#     except Exception as e:
+#         print(f"Uncaught Exception {model}: {type(e).__name__} - {e}")
+#         traceback.print_exc()
+#         pass
+#     return False
+# # Repeat each model 500 times
+# extended_models = [model for model in models for _ in range(250)]
 
-# Repeat each model 500 times
-extended_models = [model for model in models for _ in range(250)]
 
+# def worker(model):
+#     return test_model_call(model)
 
-def worker(model):
-    return test_model(model)
+# # Create a dictionary to store the results
+# counts = {True: 0, False: 0}
 
-# Create a dictionary to store the results
-counts = {True: 0, False: 0}
+# # Use Thread Pool Executor
+# with ThreadPoolExecutor(max_workers=500) as executor:
+#     # Use map to start the operation in thread pool
+#     results = executor.map(worker, extended_models)
 
-# Use Thread Pool Executor
-with ThreadPoolExecutor(max_workers=500) as executor:
-    # Use map to start the operation in thread pool
-    results = executor.map(worker, extended_models)
+#     # Iterate over results and count True/False
+#     for result in results:
+#         counts[result] += 1
 
-    # Iterate over results and count True/False
-    for result in results:
-        counts[result] += 1
-
-accuracy_score = counts[True]/(counts[True] + counts[False])
-print(f"accuracy_score: {accuracy_score}")
+# accuracy_score = counts[True]/(counts[True] + counts[False])
+# print(f"accuracy_score: {accuracy_score}")
diff --git a/litellm/utils.py b/litellm/utils.py
index 777c9f3b6..994b4916c 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1451,30 +1451,36 @@ def exception_type(model, original_exception, custom_llm_provider):
             if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]:
                 exception_mapping_worked = True
                 raise ContextWindowExceededError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
             elif "error" in error_response and "invalid private key" in error_response["error"]:
                 exception_mapping_worked = True
                 raise AuthenticationError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     llm_provider="together_ai"
                 )
             elif "error" in error_response and "INVALID_ARGUMENT" in error_response["error"]:
                 exception_mapping_worked = True
                 raise InvalidRequestError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
             elif "error_type" in error_response and error_response["error_type"] == "validation":
                 exception_mapping_worked = True
                 raise InvalidRequestError(
-                    message=error_response["error"],
+                    message=f"TogetherAIException - {error_response['error']}",
                     model=model,
                     llm_provider="together_ai"
                 )
+            elif original_exception.status_code == 429:
+                exception_mapping_worked = True
+                raise RateLimitError(
+                    message=f"TogetherAIException - {original_exception.message}",
+                    llm_provider="together_ai",
+                )
             print(f"error: {error_response}")
             print(f"e: {original_exception}")
             raise original_exception  # base case - return the original exception
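Usage sketch (illustrative, not part of the patch): with the hunks above applied, a call is routed through TogetherAILLM whenever custom_llm_provider="together_ai" is passed or the model name contains "togethercomputer", and the key is resolved from the api_key argument, litellm.togetherai_api_key, or the TOGETHER_AI_TOKEN / TOGETHERAI_API_KEY secrets. The model name and key below are placeholders, and it is assumed that get_secret() falls back to environment variables.

import os
from litellm import completion

# Placeholder credential; assumption: get_secret("TOGETHERAI_API_KEY") reads it from the environment.
os.environ["TOGETHERAI_API_KEY"] = "..."

messages = [{"role": "user", "content": "What is the capital of France?"}]

# "togethercomputer" in the model name selects the together_ai branch in main.py, which calls
# TogetherAILLM.completion(); that POSTs to https://api.together.xyz/inference and fills in
# choices, created, model, and usage on the returned ModelResponse.
response = completion(
    model="togethercomputer/llama-2-70b-chat",  # placeholder model name
    messages=messages,
)
print(response["choices"][0]["message"]["content"])
print(response["usage"])  # prompt_tokens / completion_tokens / total_tokens

Streaming follows the same route: when optional_params carries stream_tokens=True, TogetherAILLM.completion() returns response.iter_lines(), which the main.py hunk wraps in CustomStreamWrapper and returns as an iterator.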