diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 63d8dc73e..7d1b0da0d 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index e12757bfe..199316272 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/llms/ai21.py b/litellm/llms/ai21.py
index f3f4a4342..17d5c9bd9 100644
--- a/litellm/llms/ai21.py
+++ b/litellm/llms/ai21.py
@@ -90,7 +90,8 @@ def completion(
         else:
             try:
                 model_response["choices"][0]["message"]["content"] = completion_response["completions"][0]["data"]["text"]
-            except:
+                model_response.choices[0].finish_reason = completion_response["completions"][0]["finishReason"]["reason"]
+            except Exception as e:
                 raise AI21Error(message=json.dumps(completion_response), status_code=response.status_code)
 
         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
diff --git a/litellm/llms/anthropic.py b/litellm/llms/anthropic.py
index 021ec4a73..e1634afe0 100644
--- a/litellm/llms/anthropic.py
+++ b/litellm/llms/anthropic.py
@@ -114,6 +114,7 @@ def completion(
             model_response["choices"][0]["message"]["content"] = completion_response[
                 "completion"
             ]
+            model_response.choices[0].finish_reason = completion_response["stop_reason"]
 
         ## CALCULATING USAGE
         prompt_tokens = len(
diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py
index e2fccb569..1160e6d8d 100644
--- a/litellm/llms/huggingface_restapi.py
+++ b/litellm/llms/huggingface_restapi.py
@@ -153,9 +153,10 @@ def completion(
         elif task == "text-generation-inference":
             model_response["choices"][0]["message"][
                 "content"
-            ] = completion_response[0]["generated_text"]
-            ## GETTING LOGPROBS
+            ] = completion_response[0]["generated_text"]
+            ## GETTING LOGPROBS + FINISH REASON
             if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
+                model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
                 sum_logprob = 0
                 for token in completion_response[0]["details"]["tokens"]:
                     sum_logprob += token["logprob"]
diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py
index 4f75e6e43..47d6ab677 100644
--- a/litellm/llms/together_ai.py
+++ b/litellm/llms/together_ai.py
@@ -104,14 +104,17 @@ def completion(
                 message=json.dumps(completion_response["output"]), status_code=response.status_code
             )
-        completion_response = completion_response["output"]["choices"][0]["text"]
+        print(completion_response)
+        completion_text = completion_response["output"]["choices"][0]["text"]
 
         ## CALCULATING USAGE - baseten charges on time, not tokens - have some mapping of cost here.
         prompt_tokens = len(encoding.encode(prompt))
         completion_tokens = len(
-            encoding.encode(completion_response)
+            encoding.encode(completion_text)
         )
-        model_response["choices"][0]["message"]["content"] = completion_response
+        model_response["choices"][0]["message"]["content"] = completion_text
+        if "finish_reason" in completion_response["output"]["choices"][0]:
+            model_response.choices[0].finish_reason = completion_response["output"]["choices"][0]["finish_reason"]
         model_response["created"] = time.time()
         model_response["model"] = model
         model_response["usage"] = {
diff --git a/litellm/main.py b/litellm/main.py
index a7d9d627b..46129c7be 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -529,6 +529,8 @@ def completion(
             completion_tokens = len(encoding.encode(completion_response))
             ## RESPONSE OBJECT
             model_response["choices"][0]["message"]["content"] = completion_response
+            if response[0].finish_reason:
+                model_response.choices[0].finish_reason = response[0].finish_reason
             model_response["created"] = time.time()
             model_response["model"] = model
             model_response["usage"] = {
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 6be7f24d3..934354c2c 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -49,7 +49,7 @@ def test_completion_claude():
         print(response)
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-
+# test_completion_claude()
 # aleph alpha
 # def test_completion_aleph_alpha():
 #     try:
@@ -119,8 +119,8 @@ def test_completion_claude_stream():
 #     try:
 #         user_message = "write some code to find the sum of two numbers"
 #         messages = [{ "content": user_message,"role": "user"}]
-#         api_base = "https://wyh9bqfgj2r1klv5.us-east-1.aws.endpoints.huggingface.cloud"
-#         response = completion(model="facebook/blenderbot-400M-distill", messages=messages, custom_llm_provider="huggingface", task="conversational", api_base=api_base, logger_fn=logger_fn)
+#         api_base = "https://ecd4sb5n09bo4ei2.us-east-1.aws.endpoints.huggingface.cloud"
+#         response = completion(model="togethercomputer/LLaMA-2-7B-32K", messages=messages, custom_llm_provider="huggingface", api_base=api_base, logger_fn=logger_fn)
 #         # Add any assertions here to check the response
 #         print(response)
 #     except Exception as e:
@@ -141,26 +141,26 @@ def test_completion_claude_stream():
 #         pytest.fail(f"Error occurred: {e}")
 
 
-# def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
-#     try:
-#         response = completion(
-#             model="command-nightly",
-#             messages=messages,
-#             max_tokens=100,
-#             logit_bias={40: 10},
-#         )
-#         # Add any assertions here to check the response
-#         print(response)
-#         response_str = response["choices"][0]["message"]["content"]
-#         print(f"str response{response_str}")
-#         response_str_2 = response.choices[0].message.content
-#         if type(response_str) != str:
-#             pytest.fail(f"Error occurred: {e}")
-#         if type(response_str_2) != str:
-#             pytest.fail(f"Error occurred: {e}")
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
-## 
+def test_completion_cohere(): # commenting for now as the cohere endpoint is being flaky
+    try:
+        response = completion(
+            model="command-nightly",
+            messages=messages,
+            max_tokens=100,
+            logit_bias={40: 10},
+            logger_fn=logger_fn
+        )
+        # Add any assertions here to check the response
+        print(response)
+        response_str = response["choices"][0]["message"]["content"]
+        print(f"str response{response_str}")
+        response_str_2 = response.choices[0].message.content
+        if type(response_str) != str:
+            pytest.fail(f"Error occurred: {e}")
+        if type(response_str_2) != str:
+            pytest.fail(f"Error occurred: {e}")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
 
 def test_completion_cohere_stream():
     try:
@@ -750,15 +750,16 @@ def test_completion_with_fallbacks():
 
 #### Test A121 ###################
 
-# def test_completion_ai21():
-#     model_name = "j2-light"
-#     try:
-#         response = completion(model=model_name, messages=messages)
-#         # Add any assertions here to check the response
-#         print(response)
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
+def test_completion_ai21():
+    model_name = "j2-light"
+    try:
+        response = completion(model=model_name, messages=messages)
+        # Add any assertions here to check the response
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
 
+# test_completion_ai21()
 # test config file with completion #
 # def test_completion_openai_config():
 #     try:
diff --git a/litellm/utils.py b/litellm/utils.py
index fe3efe06b..c5f35cfde 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -830,7 +830,23 @@ def get_optional_params(  # use the openai defaults
             optional_params["top_k"] = top_k
         if stop != None:
             optional_params["stop_sequences"] = stop
-
+    elif custom_llm_provider == "huggingface":
+        if temperature != 1:
+            optional_params["temperature"] = temperature
+        if top_p != 1:
+            optional_params["top_p"] = top_p
+        if n != 1:
+            optional_params["n"] = n
+        if stream:
+            optional_params["stream"] = stream
+        if stop != None:
+            optional_params["stop"] = stop
+        if max_tokens != float("inf"):
+            optional_params["max_new_tokens"] = max_tokens
+        if presence_penalty != 0:
+            optional_params["repetition_penalty"] = presence_penalty
+        optional_params["details"] = True
+        optional_params["task"] = task
     elif custom_llm_provider == "together_ai" or ("togethercomputer" in model):
         if stream:
             optional_params["stream_tokens"] = stream
@@ -867,23 +883,6 @@ def get_optional_params(  # use the openai defaults
             optional_params["num_beams"] = num_beams
         if max_tokens != float("inf"):
             optional_params["max_new_tokens"] = max_tokens
-    elif custom_llm_provider == "huggingface":
-        if temperature != 1:
-            optional_params["temperature"] = temperature
-        if top_p != 1:
-            optional_params["top_p"] = top_p
-        if n != 1:
-            optional_params["n"] = n
-        if stream:
-            optional_params["stream"] = stream
-        if stop != None:
-            optional_params["stop"] = stop
-        if max_tokens != float("inf"):
-            optional_params["max_new_tokens"] = max_tokens
-        if presence_penalty != 0:
-            optional_params["repetition_penalty"] = presence_penalty
-        optional_params["details"] = True
-        optional_params["task"] = task
     elif custom_llm_provider == "sagemaker":
         if "llama-2" in model:
             # llama-2 models on sagemaker support the following args
diff --git a/pyproject.toml b/pyproject.toml
index 5e247bbf8..6585ca5ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.620"
+version = "0.1.621"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"