diff --git a/litellm/__init__.py b/litellm/__init__.py
index ec3ea31651..882994184e 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -29,6 +29,7 @@ vertex_location: Optional[str] = None
 togetherai_api_key: Optional[str] = None
 baseten_key: Optional[str] = None
 aleph_alpha_key: Optional[str] = None
+nlp_cloud_key: Optional[str] = None
 use_client = False
 logging = True
 caching = False # deprecated son
@@ -152,6 +153,8 @@ huggingface_models = [
 ai21_models = ["j2-ultra", "j2-mid", "j2-light"]
 
+nlp_cloud_models = ["dolphin", "chatdolphin"]
+
 together_ai_models = [
     "togethercomputer/llama-2-70b-chat",
     "togethercomputer/Llama-2-7B-32K-Instruct",
@@ -183,6 +186,7 @@ model_list = (
     + together_ai_models
     + baseten_models
     + aleph_alpha_models
+    + nlp_cloud_models
 )
 
 provider_list = [
@@ -200,6 +204,7 @@ provider_list = [
     "sagemaker",
     "bedrock",
     "vllm",
+    "nlp_cloud",
     "custom", # custom apis
 ]
diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index b5b19892ba..f687fe5eca 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 7d1b0da0d7..57f2e449a6 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index 1993162724..0062150af5 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/nlp_cloud.py b/litellm/llms/nlp_cloud.py
new file mode 100644
index 0000000000..8dd2e02309
--- /dev/null
+++ b/litellm/llms/nlp_cloud.py
@@ -0,0 +1,103 @@
+import os
+import json
+from enum import Enum
+import requests
+import time
+from typing import Callable
+from litellm.utils import ModelResponse
+
+class NLPCloudError(Exception):
+    def __init__(self, status_code, message):
+        self.status_code = status_code
+        self.message = message
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+def validate_environment(api_key):
+    headers = {
+        "accept": "application/json",
+        "content-type": "application/json",
+    }
+    if api_key:
+        headers["Authorization"] = f"Token {api_key}"
+    return headers
+
+def completion(
+    model: str,
+    messages: list,
+    model_response: ModelResponse,
+    print_verbose: Callable,
+    encoding,
+    api_key,
+    logging_obj,
+    optional_params=None,
+    litellm_params=None,
+    logger_fn=None,
+    default_max_tokens_to_sample=None,
+):
+    headers = validate_environment(api_key)
+    completion_url_fragment_1 = "https://api.nlpcloud.io/v1/gpu/"
+    completion_url_fragment_2 = "/generation"
+    model = model
+    text = " ".join(message["content"] for message in messages)
+
+    data = {
+        "text": text,
+        **optional_params,
+    }
+
+    completion_url = completion_url_fragment_1 + model + completion_url_fragment_2
+    ## LOGGING
+    logging_obj.pre_call(
+        input=text,
+        api_key=api_key,
+        additional_args={"complete_input_dict": data},
+    )
+    ## COMPLETION CALL
+    response = requests.post(
+        completion_url, headers=headers, data=json.dumps(data), stream=optional_params["stream"] if "stream" in optional_params else False
+    )
+    if "stream" in optional_params and optional_params["stream"] == True:
+        return response.iter_lines()
+    else:
+        ## LOGGING
+        logging_obj.post_call(
+            input=text,
+            api_key=api_key,
+            original_response=response.text,
+            additional_args={"complete_input_dict": data},
+        )
+        print_verbose(f"raw model_response: {response.text}")
+        ## RESPONSE OBJECT
+        try:
+            completion_response = response.json()
+        except:
+            raise NLPCloudError(message=response.text, status_code=response.status_code)
+        if "error" in completion_response:
+            raise NLPCloudError(
+                message=completion_response["error"],
+                status_code=response.status_code,
+            )
+        else:
+            try:
+                model_response["choices"][0]["message"]["content"] = completion_response["generated_text"]
+            except:
+                raise NLPCloudError(message=json.dumps(completion_response), status_code=response.status_code)
+
+        ## CALCULATING USAGE - NLP Cloud returns token counts in its response (nb_input_tokens / nb_generated_tokens), so use them directly
+        prompt_tokens = completion_response["nb_input_tokens"]
+        completion_tokens = completion_response["nb_generated_tokens"]
+
+        model_response["created"] = time.time()
+        model_response["model"] = model
+        model_response["usage"] = {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": prompt_tokens + completion_tokens,
+        }
+        return model_response
+
+def embedding():
+    # logic for parsing in - calling - parsing out model embedding calls
+    pass
diff --git a/litellm/main.py b/litellm/main.py
index 243dd891e2..476f83da9b 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -27,6 +27,7 @@ from .llms import bedrock
 from .llms import huggingface_restapi
 from .llms import replicate
 from .llms import aleph_alpha
+from .llms import nlp_cloud
 from .llms import baseten
 from .llms import vllm
 from .llms import ollama
@@ -111,6 +112,7 @@ def completion(
     # model specific optional params
     top_k=40,# used by text-bison only
     task: Optional[str]="text-generation-inference", # used by huggingface inference endpoints
+    remove_input: bool = True, # used by nlp cloud models - prevents input text from being returned as part of output
     request_timeout=0, # unused var for old version of OpenAI API
     fallbacks=[],
     caching = False,
@@ -154,7 +156,8 @@ def completion(
             model=model,
             custom_llm_provider=custom_llm_provider,
             top_k=top_k,
-            task=task
+            task=task,
+            remove_input=remove_input
         )
         # For logging - save the values of the litellm-specific params passed in
         litellm_params = get_litellm_params(
@@ -421,6 +424,29 @@ def completion(
                 response = CustomStreamWrapper(model_response, model, logging_obj=logging)
                 return response
             response = model_response
+        elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+            nlp_cloud_key = (
+                api_key or litellm.nlp_cloud_key or get_secret("NLP_CLOUD_API_KEY") or litellm.api_key
+            )
+
+            model_response = nlp_cloud.completion(
+                model=model,
+                messages=messages,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,
+                logger_fn=logger_fn,
+                encoding=encoding,
+                api_key=nlp_cloud_key,
+                logging_obj=logging
+            )
+
+            if "stream" in optional_params and optional_params["stream"] == True:
+                # don't try to access stream object,
+                response = CustomStreamWrapper(model_response, model, custom_llm_provider="nlp_cloud", logging_obj=logging)
+                return response
+            response = model_response
         elif model in litellm.aleph_alpha_models:
             aleph_alpha_key = (
                 api_key or litellm.aleph_alpha_key or get_secret("ALEPH_ALPHA_API_KEY") or get_secret("ALEPHALPHA_API_KEY") or litellm.api_key
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 84336eb88f..98faeb5c2c 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -113,8 +113,38 @@ def test_completion_claude_stream():
         pytest.fail(f"Error occurred: {e}")
 
 # test_completion_claude_stream()
 
+def test_completion_nlp_cloud():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(model="dolphin", messages=messages, logger_fn=logger_fn)
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+def test_completion_nlp_cloud_streaming():
+    try:
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {
+                "role": "user",
+                "content": "how does a court case get to the Supreme Court?",
+            },
+        ]
+        response = completion(model="dolphin", messages=messages, stream=True, logger_fn=logger_fn)
+        # Add any assertions here to check the response
+        for chunk in response:
+            print(chunk["choices"][0]["delta"]["content"]) # same as openai format
+            print(chunk["choices"][0]["finish_reason"])
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+test_completion_nlp_cloud_streaming()
 
 # def test_completion_hf_api():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py
index f08705e41c..6320b7131d 100644
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@@ -75,6 +75,9 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
     elif model in litellm.aleph_alpha_models:
         temporary_key = os.environ["ALEPH_ALPHA_API_KEY"]
         os.environ["ALEPH_ALPHA_API_KEY"] = "bad-key"
+    elif model in litellm.nlp_cloud_models:
+        temporary_key = os.environ["NLP_CLOUD_API_KEY"]
+        os.environ["NLP_CLOUD_API_KEY"] = "bad-key"
     elif (
         model
         == "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
@@ -119,6 +122,8 @@ def invalid_auth(model): # set the model key to an invalid key, depending on th
         os.environ["TOGETHERAI_API_KEY"] = temporary_key
     elif model in litellm.aleph_alpha_models:
         os.environ["ALEPH_ALPHA_API_KEY"] = temporary_key
+    elif model in litellm.nlp_cloud_models:
+        os.environ["NLP_CLOUD_API_KEY"] = temporary_key
     return
 
 # Test 3: Invalid Request Error
diff --git a/litellm/utils.py b/litellm/utils.py
index c5f35cfde4..ff7a8588c5 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -783,6 +783,7 @@ def get_optional_params( # use the openai defaults
     frequency_penalty=0,
     logit_bias={},
     num_beams=1,
+    remove_input=False, # for nlp_cloud
     user="",
     deployment_id=None,
     model=None,
@@ -917,6 +918,29 @@ def get_optional_params( # use the openai defaults
             optional_params["n"] = n
         if stop != None:
             optional_params["stop_sequences"] = stop
+    elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+        if max_tokens != float("inf"):
+            optional_params["max_length"] = max_tokens
+        if stream:
+            optional_params["stream"] = stream
+        if temperature != 1:
+            optional_params["temperature"] = temperature
+        if top_k != 40:
+            optional_params["top_k"] = top_k
+        if top_p != 1:
+            optional_params["top_p"] = top_p
+        if presence_penalty != 0:
+            optional_params["presence_penalty"] = presence_penalty
+        if frequency_penalty != 0:
+            optional_params["frequency_penalty"] = frequency_penalty
+        if num_beams != 1:
+            optional_params["num_beams"] = num_beams
+        if n != 1:
+            optional_params["num_return_sequences"] = n
+        if remove_input == True:
+            optional_params["remove_input"] = True
+        if stop != None:
+            optional_params["stop_sequences"] = stop
     else: # assume passing in params for openai/azure openai
         if functions != []:
             optional_params["functions"] = functions
@@ -993,6 +1017,9 @@ def get_llm_provider(model: str, custom_llm_provider: Optional[str] = None):
     ## baseten
     elif model in litellm.baseten_models:
         custom_llm_provider = "baseten"
+    ## nlp_cloud
+    elif model in litellm.nlp_cloud_models:
+        custom_llm_provider = "nlp_cloud"
 
     if custom_llm_provider is None or custom_llm_provider=="":
         raise ValueError(f"LLM Provider NOT provided. Pass in the LLM provider you are trying to call. E.g. For 'Huggingface' inference endpoints pass in `completion(model='huggingface/{model}',..)` Learn more: https://docs.litellm.ai/docs/providers")
@@ -1968,6 +1995,81 @@ def exception_type(model, original_exception, custom_llm_provider):
                     llm_provider="ai21",
                     model=model
                 )
+            elif model in litellm.nlp_cloud_models or custom_llm_provider == "nlp_cloud":
+                if "detail" in error_str:
+                    if "Input text length should not exceed" in error_str:
+                        exception_mapping_worked = True
+                        raise ContextWindowExceededError(
+                            message=f"NLPCloudException - {error_str}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    elif "value is not a valid" in error_str:
+                        exception_mapping_worked = True
+                        raise InvalidRequestError(
+                            message=f"NLPCloudException - {error_str}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    else:
+                        exception_mapping_worked = True
+                        raise APIError(
+                            status_code=500,
+                            message=f"NLPCloudException - {error_str}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                if hasattr(original_exception, "status_code"): # https://docs.nlpcloud.com/?shell#errors
+                    if original_exception.status_code == 400 or original_exception.status_code == 406 or original_exception.status_code == 413 or original_exception.status_code == 422:
+                        exception_mapping_worked = True
+                        raise InvalidRequestError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
+                    elif original_exception.status_code == 401 or original_exception.status_code == 403:
+                        exception_mapping_worked = True
+                        raise AuthenticationError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
+                    elif original_exception.status_code == 522 or original_exception.status_code == 524:
+                        exception_mapping_worked = True
+                        raise Timeout(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    elif original_exception.status_code == 429 or original_exception.status_code == 402:
+                        exception_mapping_worked = True
+                        raise RateLimitError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                        )
+                    elif original_exception.status_code == 500 or original_exception.status_code == 503:
+                        exception_mapping_worked = True
+                        raise APIError(
+                            status_code=original_exception.status_code,
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
+                    elif original_exception.status_code == 504 or original_exception.status_code == 520:
+                        exception_mapping_worked = True
+                        raise ServiceUnavailableError(
+                            message=f"NLPCloudException - {original_exception.message}",
+                            model=model,
+                            llm_provider="nlp_cloud"
+                        )
+                    else:
+                        exception_mapping_worked = True
+                        raise APIError(
+                            status_code=original_exception.status_code,
+                            message=f"NLPCloudException - {original_exception.message}",
+                            llm_provider="nlp_cloud",
+                            model=model
+                        )
"together_ai": error_response = json.loads(error_str) if "error" in error_response and "`inputs` tokens + `max_new_tokens` must be <=" in error_response["error"]: @@ -2240,6 +2342,15 @@ class CustomStreamWrapper: except: raise ValueError(f"Unable to parse response. Original response: {chunk}") + def handle_nlp_cloud_chunk(self, chunk): + chunk = chunk.decode("utf-8") + data_json = json.loads(chunk) + try: + print(f"data json: {data_json}") + return data_json["generated_text"] + except: + raise ValueError(f"Unable to parse response. Original response: {chunk}") + def handle_aleph_alpha_chunk(self, chunk): chunk = chunk.decode("utf-8") data_json = json.loads(chunk) @@ -2320,12 +2431,15 @@ class CustomStreamWrapper: elif self.custom_llm_provider and self.custom_llm_provider == "vllm": chunk = next(self.completion_stream) completion_obj["content"] = chunk[0].outputs[0].text - elif self.model in litellm.aleph_alpha_models: #ai21 doesn't provide streaming + elif self.model in litellm.aleph_alpha_models: #aleph alpha doesn't provide streaming chunk = next(self.completion_stream) completion_obj["content"] = self.handle_aleph_alpha_chunk(chunk) elif self.model in litellm.open_ai_text_completion_models: chunk = next(self.completion_stream) completion_obj["content"] = self.handle_openai_text_completion_chunk(chunk) + elif self.model in litellm.nlp_cloud_models or self.custom_llm_provider == "nlp_cloud": + chunk = next(self.completion_stream) + completion_obj["content"] = self.handle_nlp_cloud_chunk(chunk) else: # openai chat/azure models chunk = next(self.completion_stream) return chunk # open ai returns finish_reason, we should just return the openai chunk diff --git a/pyproject.toml b/pyproject.toml index e543b97bd6..17e65d90b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "litellm" -version = "0.1.623" +version = "0.1.624" description = "Library to easily interface with LLM API providers" authors = ["BerriAI"] license = "MIT License"