diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 6b42eec17..3b91b0fb3
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index fbebc9cd5..76b82a67d
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/cohere.py b/litellm/llms/cohere.py
new file mode 100644
index 000000000..113b6b542
--- /dev/null
+++ b/litellm/llms/cohere.py
@@ -0,0 +1,101 @@
+import os
+import json
+from enum import Enum
+import requests
+import time
+from typing import Callable
+from litellm.utils import ModelResponse
+
+class CohereError(Exception):
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+def validate_environment(api_key):
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+    }
+    if api_key:
+        headers["Authorization"] = f"Bearer {api_key}"
+    return headers
+
+def completion(
+    model: str,
+    messages: list,
+    model_response: ModelResponse,
+    print_verbose: Callable,
+    encoding,
+    api_key,
+    logging_obj,
+    optional_params=None,
+    litellm_params=None,
+    logger_fn=None,
+):
+    headers = validate_environment(api_key)
+    completion_url = "https://api.cohere.ai/v1/generate"
+    model = model
+    prompt = " ".join(message["content"] for message in messages)
+    data = {
+        "model": model,
+        "prompt": prompt,
+        **optional_params,
+    }
+
+    ## LOGGING
+    logging_obj.pre_call(
+        input=prompt,
+        api_key=api_key,
+        additional_args={"complete_input_dict": data},
+    )
+    ## COMPLETION CALL
+    response = requests.post(
+        completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
+    )
+    if "stream" in optional_params and optional_params["stream"] == True:
+        return response.iter_lines()
+    else:
+        ## LOGGING
+        logging_obj.post_call(
+            input=prompt,
+            api_key=api_key,
+            original_response=response.text,
+            additional_args={"complete_input_dict": data},
+        )
+        print_verbose(f"raw model_response: {response.text}")
+        ## RESPONSE OBJECT
+        completion_response = response.json()
+        if "error" in completion_response:
+            raise CohereError(
+                message=completion_response["error"],
+                status_code=response.status_code,
+            )
+        else:
+            try:
+                model_response["choices"][0]["message"]["content"] = completion_response["generations"][0]["text"]
+            except:
+                raise CohereError(message=json.dumps(completion_response), status_code=response.status_code)
+
+        ## CALCULATING USAGE - token counts computed locally with the provided encoding
+        prompt_tokens = len(
+            encoding.encode(prompt)
+        )
+        completion_tokens = len(
+            encoding.encode(model_response["choices"][0]["message"]["content"])
+        )
+
+        model_response["created"] = time.time()
+        model_response["model"] = model
+        model_response["usage"] = {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        }
+        return model_response
+
+def embedding():
+    # logic for parsing in - calling - parsing out model embedding calls
+    pass
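
Illustrative sketch (not part of the patch): the new handler above calls Cohere's REST endpoint directly instead of importing the `cohere` SDK. This is a minimal sketch of the request/response shape it assumes, reusing the URL, headers, and `generations[0]["text"]` field from `completion()` above; reading the key from an environment variable here is just for illustration, since the handler itself receives `api_key` as an argument.

    # Illustrative only -- not part of the patch.
    import json
    import os

    import requests

    api_key = os.environ["COHERE_API_KEY"]  # illustrative; completion() is handed the key directly
    resp = requests.post(
        "https://api.cohere.ai/v1/generate",
        headers={
            "accept": "application/json",
            "content-type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        data=json.dumps({"model": "command-nightly", "prompt": "Hello, how are you?", "max_tokens": 50}),
    )
    completion_response = resp.json()
    # Non-streaming responses are parsed the same way completion() does it above.
    print(completion_response["generations"][0]["text"])
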
diff --git a/litellm/main.py b/litellm/main.py
index 9edf423e0..26d54cf67 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -32,6 +32,7 @@ from .llms import nlp_cloud
 from .llms import baseten
 from .llms import vllm
 from .llms import ollama
+from .llms import cohere
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, List, Optional, Dict
@@ -547,12 +548,6 @@ def completion(
                 input=messages, api_key=openai.api_key, original_response=response
             )
         elif model in litellm.cohere_models:
-            # import cohere/if it fails then pip install cohere
-            try:
-                import cohere
-            except:
-                raise Exception("Cohere import failed please run `pip install cohere`")
-
             cohere_key = (
                 api_key
                 or litellm.cohere_key
@@ -560,35 +555,23 @@ def completion(
                 or get_secret("CO_API_KEY")
                 or litellm.api_key
             )
-            co = cohere.Client(cohere_key)
-            prompt = " ".join([message["content"] for message in messages])
-            ## LOGGING
-            logging.pre_call(input=prompt, api_key=cohere_key)
-            ## COMPLETION CALL
-            response = co.generate(model=model, prompt=prompt, **optional_params)
+            model_response = cohere.completion(
+                model=model,
+                messages=messages,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,
+                logger_fn=logger_fn,
+                encoding=encoding,
+                api_key=cohere_key,
+                logging_obj=logging  # model call logging is done inside cohere.completion, as we may need to modify I/O to fit Cohere's requirements
+            )
+
             if "stream" in optional_params and optional_params["stream"] == True:
                 # don't try to access the stream object,
-                response = CustomStreamWrapper(response, model, logging_obj=logging)
+                response = CustomStreamWrapper(model_response, model, logging_obj=logging)
                 return response
-            ## LOGGING
-            logging.post_call(
-                input=prompt, api_key=cohere_key, original_response=response
-            )
-            ## USAGE
-            completion_response = response[0].text
-            prompt_tokens = len(encoding.encode(prompt))
-            completion_tokens = len(encoding.encode(completion_response))
-            ## RESPONSE OBJECT
-            model_response["choices"][0]["message"]["content"] = completion_response
-            if response[0].finish_reason:
-                model_response.choices[0].finish_reason = response[0].finish_reason
-            model_response["created"] = time.time()
-            model_response["model"] = model
-            model_response["usage"] = {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            }
             response = model_response
         elif (
             (
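
Illustrative sketch (not part of the patch): with the routing change above, the Cohere path is exercised entirely through `litellm.completion`, mirroring the tests further down. The key value below is a placeholder; per the resolution order above, `api_key`, `litellm.cohere_key`, `COHERE_API_KEY`, `CO_API_KEY`, or `litellm.api_key` all work.

    # Illustrative only -- not part of the patch.
    import os

    from litellm import completion

    os.environ["COHERE_API_KEY"] = "my-cohere-key"  # placeholder value
    messages = [{"role": "user", "content": "Hello, how are you?"}]

    # Non-streaming: main.py hands off to cohere.completion() and returns a populated ModelResponse.
    response = completion(model="command-nightly", messages=messages, max_tokens=50)
    print(response["choices"][0]["message"]["content"])

    # Streaming: the raw line iterator from cohere.completion() is wrapped in CustomStreamWrapper.
    for chunk in completion(model="command-nightly", messages=messages, stream=True, max_tokens=50):
        print(chunk["choices"][0]["delta"]["content"], end="")
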
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 98faeb5c2..e8da75a8c 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -144,7 +144,7 @@ def test_completion_nlp_cloud_streaming():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
-test_completion_nlp_cloud_streaming()
+# test_completion_nlp_cloud_streaming()
 # def test_completion_hf_api():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
@@ -183,7 +183,6 @@ def test_completion_cohere(): # commenting for now as the cohere endpoint is bei
         # Add any assertions here to check the response
         print(response)
         response_str = response["choices"][0]["message"]["content"]
-        print(f"str response{response_str}")
         response_str_2 = response.choices[0].message.content
         if type(response_str) != str:
             pytest.fail(f"Error occurred: {e}")
@@ -192,6 +191,8 @@ def test_completion_cohere(): # commenting for now as the cohere endpoint is bei
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
 
+# test_completion_cohere()
+
 def test_completion_cohere_stream():
     try:
         messages = [
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index c577567b7..43308aa16 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -3,7 +3,7 @@
 import sys, os, asyncio
 import traceback
-import time
+import time, pytest
 sys.path.insert(
     0, os.path.abspath("../..")
 )
@@ -24,6 +24,30 @@ def logger_fn(model_call_object: dict):
 user_message = "Hello, how are you?"
 messages = [{"content": user_message, "role": "user"}]
 
+def test_completion_cohere_stream():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(
+            model="command-nightly", messages=messages, stream=True, max_tokens=50
+        )
+        complete_response = ""
+        # Add any assertions here to check the response
+        for chunk in response:
+            print(f"chunk: {chunk}")
+            complete_response += chunk["choices"][0]["delta"]["content"]
+        if complete_response == "":
+            raise Exception("Empty response received")
+        print(f"completion_response: {complete_response}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test on baseten completion call
 # try:
 #     response = completion(
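
Illustrative sketch (not part of the patch): the streaming test above reads `chunk["choices"][0]["delta"]["content"]`; the `utils.py` changes that follow supply that content by decoding each raw line from `response.iter_lines()` and returning its `text` field. A small sketch of that per-chunk parsing, with a fabricated sample payload (the exact fields Cohere streams are an assumption here beyond `text`, which the handler relies on):

    # Illustrative only -- not part of the patch; the payload is a made-up example of one streamed line.
    import json

    raw_chunk = b'{"text": " Hello there", "is_finished": false}'  # assumed shape of a streamed line
    data_json = json.loads(raw_chunk.decode("utf-8"))
    # handle_cohere_chunk() (added below) returns data_json["text"], raising ValueError if parsing fails.
    print(data_json["text"])
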
diff --git a/litellm/utils.py b/litellm/utils.py
index 06149c83e..e0c29896e 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -1854,7 +1854,7 @@ def exception_type(model, original_exception, custom_llm_provider):
                     llm_provider="replicate",
                     model=model
                 )
-        elif model in litellm.cohere_models:  # Cohere
+        elif model in litellm.cohere_models or custom_llm_provider == "cohere":  # Cohere
             if (
                 "invalid api token" in error_str
                 or "No API key provided." in error_str
@@ -1872,6 +1872,21 @@ def exception_type(model, original_exception, custom_llm_provider):
                     model=model,
                     llm_provider="cohere",
                 )
+            elif hasattr(original_exception, "status_code"):
+                if original_exception.status_code == 400 or original_exception.status_code == 498:
+                    exception_mapping_worked = True
+                    raise InvalidRequestError(
+                        message=f"CohereException - {original_exception.message}",
+                        llm_provider="cohere",
+                        model=model
+                    )
+                elif original_exception.status_code == 500:
+                    exception_mapping_worked = True
+                    raise ServiceUnavailableError(
+                        message=f"CohereException - {original_exception.message}",
+                        llm_provider="cohere",
+                        model=model
+                    )
             elif (
                 "CohereConnectionError" in exception_type
             ):  # cohere seems to fire these errors when we load test it (1k+ messages / min)
@@ -2287,14 +2302,10 @@ class CustomStreamWrapper:
         self.model = model
         self.custom_llm_provider = custom_llm_provider
         self.logging_obj = logging_obj
+        self.completion_stream = completion_stream
         if self.logging_obj:
             # Log the type of the received item
             self.logging_obj.post_call(str(type(completion_stream)))
-        if model in litellm.cohere_models:
-            # these do not return an iterator, so we need to wrap it in one
-            self.completion_stream = iter(completion_stream)
-        else:
-            self.completion_stream = completion_stream
 
     def __iter__(self):
         return self
@@ -2359,6 +2370,16 @@ class CustomStreamWrapper:
         except:
             raise ValueError(f"Unable to parse response. Original response: {chunk}")
 
+    def handle_cohere_chunk(self, chunk):
+        chunk = chunk.decode("utf-8")
+        print(f"cohere chunk: {chunk}")
+        data_json = json.loads(chunk)
+        try:
+            print(f"data json: {data_json}")
+            return data_json["text"]
+        except:
+            raise ValueError(f"Unable to parse response. Original response: {chunk}")
+
     def handle_openai_text_completion_chunk(self, chunk):
         try:
             return chunk["choices"][0]["text"]
@@ -2416,9 +2437,6 @@ class CustomStreamWrapper:
                 if text_data == "":
                     return self.__next__()
                 completion_obj["content"] = text_data
-            elif self.model in litellm.cohere_models:
-                chunk = next(self.completion_stream)
-                completion_obj["content"] = chunk.text
             elif self.custom_llm_provider and self.custom_llm_provider == "huggingface":
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = self.handle_huggingface_chunk(chunk)
@@ -2440,6 +2458,9 @@ class CustomStreamWrapper:
             elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud":
                 chunk = next(self.completion_stream)
                 completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk)
+            elif self.model in litellm.cohere_models or self.custom_llm_provider == "cohere":
+                chunk = next(self.completion_stream)
+                completion_obj["content"] = self.handle_cohere_chunk(chunk)
             else:  # openai chat/azure models
                 chunk = next(self.completion_stream)
                 return chunk  # open ai returns finish_reason, we should just return the openai chunk
diff --git a/pyproject.toml b/pyproject.toml
index 5d259cdc3..9a81f0f9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.625"
+version = "0.1.626"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
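
Illustrative sketch (not part of the patch): the new status-code branches in `exception_type` map `CohereError`s from the handler onto litellm's own exception types, 400/498 to `InvalidRequestError` and 500 to `ServiceUnavailableError`. Calling `exception_type` directly like this is purely for illustration, and the error message text is arbitrary.

    # Illustrative only -- not part of the patch.
    from litellm.llms.cohere import CohereError
    from litellm.utils import exception_type

    try:
        exception_type(
            model="command-nightly",
            original_exception=CohereError(status_code=500, message="internal server error"),
            custom_llm_provider="cohere",
        )
    except Exception as e:
        # Expected to surface as a ServiceUnavailableError carrying a "CohereException - ..." message.
        print(type(e).__name__, e)
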