diff --git a/community_resources/max_tokens.json b/community_resources/max_tokens.json
new file mode 100644
index 000000000..e61239f2c
--- /dev/null
+++ b/community_resources/max_tokens.json
@@ -0,0 +1,16 @@
+{"MODEL_COST":{
+    "gpt-3.5-turbo": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-0613": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-0301": {"max_tokens": 4000, "input_cost_per_token": 0.0000015, "output_cost_per_token": 0.000002},
+    "gpt-3.5-turbo-16k": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-3.5-turbo-16k-0613": {"max_tokens": 16000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.000004},
+    "gpt-4": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
+    "gpt-4-0613": {"max_tokens": 8000, "input_cost_per_token": 0.000003, "output_cost_per_token": 0.00006},
+    "gpt-4-32k": {"max_tokens": 8000, "input_cost_per_token": 0.00006, "output_cost_per_token": 0.00012},
+    "claude-instant-1": {"max_tokens": 100000, "input_cost_per_token": 0.00000163, "output_cost_per_token": 0.00000551},
+    "claude-2": {"max_tokens": 100000, "input_cost_per_token": 0.00001102, "output_cost_per_token": 0.00003268},
+    "text-bison-001": {"max_tokens": 8192, "input_cost_per_token": 0.000004, "output_cost_per_token": 0.000004},
+    "chat-bison-001": {"max_tokens": 4096, "input_cost_per_token": 0.000002, "output_cost_per_token": 0.000002},
+    "command-nightly": {"max_tokens": 4096, "input_cost_per_token": 0.000015, "output_cost_per_token": 0.000015},
+    "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1": {"max_tokens": 4096, "input_cost_per_token": 0.00000608, "output_cost_per_token": 0.00000608}
+}}
\ No newline at end of file
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 785b1d293..3efd14343 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -9,6 +9,20 @@ azure_key = None
 anthropic_key = None
 replicate_key = None
 cohere_key = None
+MAX_TOKENS = {
+    'gpt-3.5-turbo': 4000,
+    'gpt-3.5-turbo-0613': 4000,
+    'gpt-3.5-turbo-0301': 4000,
+    'gpt-3.5-turbo-16k': 16000,
+    'gpt-3.5-turbo-16k-0613': 16000,
+    'gpt-4': 8000,
+    'gpt-4-0613': 8000,
+    'gpt-4-32k': 32000,
+    'claude-instant-1': 100000,
+    'claude-2': 100000,
+    'command-nightly': 4096,
+    'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1': 4096,
+}
 ####### PROXY PARAMS ################### configurable params if you use proxy models like Helicone
 api_base = None
 headers = None
diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 000000000..eee4d9926
Binary files /dev/null and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
new file mode 100644
index 000000000..261f8e6ce
Binary files /dev/null and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/timeout.cpython-311.pyc b/litellm/__pycache__/timeout.cpython-311.pyc
new file mode 100644
index 000000000..c7211e4a7
Binary files /dev/null and b/litellm/__pycache__/timeout.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 000000000..f298d2988
Binary files /dev/null and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/main.py b/litellm/main.py
index 0d96073e7..09ac2f329 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -74,7 +74,7 @@ async def acompletion(*args, **kwargs):
     return await loop.run_in_executor(None, func)
 
 @client
-@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False`
+# @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(2), reraise=True, retry_error_callback=lambda retry_state: setattr(retry_state.outcome, 'retry_variable', litellm.retry)) # retry call, turn this off by setting `litellm.retry = False`
 @timeout(60) ## set timeouts, in case calls hang (e.g. Azure) - default is 60s, override with `force_timeout`
 def completion(
     model, messages, # required params
@@ -255,8 +255,8 @@ def completion(
     elif model in litellm.cohere_models:
         if api_key:
             cohere_key = api_key
-        elif litellm.api_key:
-            cohere_key = litellm.api_key
+        elif litellm.cohere_key:
+            cohere_key = litellm.cohere_key
         else:
             cohere_key = os.environ.get("COHERE_API_KEY")
         co = cohere.Client(cohere_key)
@@ -330,6 +330,7 @@ def embedding(model, input=[], azure=False, force_timeout=60, logger_fn=None):
         logging(model=model, input=input, azure=azure, logger_fn=logger_fn, exception=e)
         ## Map to OpenAI Exception
         raise exception_type(model=model, original_exception=e)
+        raise e
 ####### HELPER FUNCTIONS ################
 ## Set verbose to true -> ```litellm.set_verbose = True```
 def print_verbose(print_statement):
diff --git a/litellm/tests/__pycache__/test_exceptions.cpython-311-pytest-7.4.0.pyc b/litellm/tests/__pycache__/test_exceptions.cpython-311-pytest-7.4.0.pyc
index 50f8b39d3..62f9422f8 100644
Binary files a/litellm/tests/__pycache__/test_exceptions.cpython-311-pytest-7.4.0.pyc and b/litellm/tests/__pycache__/test_exceptions.cpython-311-pytest-7.4.0.pyc differ
diff --git a/litellm/tests/test_exceptions.py b/litellm/tests/test_exceptions.py
index 5472168f9..6a6902449 100644
--- a/litellm/tests/test_exceptions.py
+++ b/litellm/tests/test_exceptions.py
@@ -8,6 +8,7 @@ from litellm import embedding, completion
 from concurrent.futures import ThreadPoolExecutor
 import pytest
+# litellm.set_verbose = True
 
 #### What this tests ####
 # This tests exception mapping -> trigger an exception from an llm provider -> assert if output is of the expected type
@@ -19,29 +20,32 @@ import pytest
 # Approach: Run each model through the test -> assert if the correct error (always the same one) is triggered
 
 # models = ["gpt-3.5-turbo", "chatgpt-test", "claude-instant-1", "command-nightly"]
-
-# # Test 1: Context Window Errors
-# @pytest.mark.parametrize("model", models)
-# def test_context_window(model):
-#     sample_text = "how does a court case get to the Supreme Court?" * 100000
-#     messages = [{"content": sample_text, "role": "user"}]
-#     try:
-#         azure = model == "chatgpt-test"
-#         print(f"model: {model}")
-#         response = completion(model=model, messages=messages, azure=azure)
-#     except InvalidRequestError:
-#         print("InvalidRequestError")
-#         return
-#     except OpenAIError:
-#         print("OpenAIError")
-#         return
-#     except Exception as e:
-#         print("Uncaught Error in test_context_window")
-#         # print(f"Error Type: {type(e).__name__}")
-#         print(f"Uncaught Exception - {e}")
-#         pytest.fail(f"Error occurred: {e}")
-#     return
-
+models = ["command-nightly"]
+def logging_fn(model_call_dict):
+    print(f"model_call_dict: {model_call_dict['model']}")
+# Test 1: Context Window Errors
+@pytest.mark.parametrize("model", models)
+def test_context_window(model):
+    sample_text = "how does a court case get to the Supreme Court?" * 100000
+    messages = [{"content": sample_text, "role": "user"}]
+    try:
+        azure = model == "chatgpt-test"
+        print(f"model: {model}")
+        response = completion(model=model, messages=messages, azure=azure, logger_fn=logging_fn)
+        print(f"response: {response}")
+    except InvalidRequestError:
+        print("InvalidRequestError")
+        return
+    except OpenAIError:
+        print("OpenAIError")
+        return
+    except Exception as e:
+        print("Uncaught Error in test_context_window")
+        # print(f"Error Type: {type(e).__name__}")
+        print(f"Uncaught Exception - {e}")
+        pytest.fail(f"Error occurred: {e}")
+    return
+test_context_window("command-nightly")
 # # Test 2: InvalidAuth Errors
 # def logger_fn(model_call_object: dict):
 #     print(f"model call details: {model_call_object}")
@@ -64,7 +68,7 @@ import pytest
 #         os.environ["REPLICATE_API_KEY"] = "bad-key"
 #         os.environ["REPLICATE_API_TOKEN"] = "bad-key"
 #         print(f"model: {model}")
-#         response = completion(model=model, messages=messages, azure=azure, logger_fn=logger_fn)
+#         response = completion(model=model, messages=messages, azure=azure)
 #         print(f"response: {response}")
 #     except AuthenticationError as e:
 #         return
@@ -75,7 +79,6 @@ import pytest
 #         pytest.fail(f"Error occurred: {e}")
 #     return
 
-
 # # Test 3: Rate Limit Errors
 # def test_model(model):
 #     try:
diff --git a/setup.py b/setup.py
index 492b6e86a..9f54d6452 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='litellm',
-    version='0.1.230',
+    version='0.1.231',
     description='Library to easily interface with LLM API providers',
     author='BerriAI',
     packages=[