diff --git a/litellm/__pycache__/__init__.cpython-311.pyc b/litellm/__pycache__/__init__.cpython-311.pyc
index fd30776861..c047a81442 100644
Binary files a/litellm/__pycache__/__init__.cpython-311.pyc and b/litellm/__pycache__/__init__.cpython-311.pyc differ
diff --git a/litellm/__pycache__/main.cpython-311.pyc b/litellm/__pycache__/main.cpython-311.pyc
index 6b6ee11b22..06cc9494ee 100644
Binary files a/litellm/__pycache__/main.cpython-311.pyc and b/litellm/__pycache__/main.cpython-311.pyc differ
diff --git a/litellm/__pycache__/utils.cpython-311.pyc b/litellm/__pycache__/utils.cpython-311.pyc
index c4a41fd5a0..93c419413c 100644
Binary files a/litellm/__pycache__/utils.cpython-311.pyc and b/litellm/__pycache__/utils.cpython-311.pyc differ
diff --git a/litellm/main.py b/litellm/main.py
index adc8df9d3a..a51498140a 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -1,14 +1,13 @@
 import os, openai, cohere, replicate, sys
 from typing import Any
 from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
-import traceback
 from functools import partial
-import dotenv
-import traceback
+import dotenv, traceback, random, asyncio, time
+from copy import deepcopy
 import litellm
 from litellm import client, logging, exception_type, timeout, get_optional_params
-import random
-import asyncio
+import tiktoken
+encoding = tiktoken.get_encoding("cl100k_base")
 from tenacity import (
     retry,
     stop_after_attempt,
@@ -17,6 +16,17 @@ from tenacity import (
 ####### ENVIRONMENT VARIABLES ###################
 dotenv.load_dotenv() # Loading env variables using dotenv
+new_response = {
+    "choices": [
+        {
+            "finish_reason": "stop",
+            "index": 0,
+            "message": {
+                "role": "assistant"
+            }
+        }
+    ]
+}
 
 # TODO move this to utils.py
 # TODO add translations
 # TODO see if this worked - model_name == krrish
@@ -44,6 +54,8 @@ def completion(
     *, return_async=False, api_key=None, force_timeout=60, azure=False, logger_fn=None, verbose=False
 ):
     try:
+        global new_response
+        model_response = deepcopy(new_response) # deep copy the default response format so we can mutate it and it's thread-safe.
         # check if user passed in any of the OpenAI optional params
         optional_params = get_optional_params(
             functions=functions, function_call=function_call,
@@ -128,6 +140,15 @@ def completion(
                 model=model,
                 prompt = prompt
             )
+            completion_response = response["choices"][0]["text"]
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            ## RESPONSE OBJECT
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = response["created"]
+            model_response["model"] = model
+            model_response["usage"] = response["usage"]
+            response = model_response
         elif "replicate" in model:
             # replicate defaults to os.environ.get("REPLICATE_API_TOKEN")
             # checking in case user set it to REPLICATE_API_KEY instead
@@ -153,19 +174,21 @@ def completion(
             response = ""
             for item in output:
                 response += item
-            new_response = {
-                "choices": [
-                    {
-                        "finish_reason": "stop",
-                        "index": 0,
-                        "message": {
-                            "content": response,
-                            "role": "assistant"
-                        }
-                    }
-                ]
-            }
-            response = new_response
+            completion_response = response
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            prompt_tokens = len(encoding.encode(prompt))
+            completion_tokens = len(encoding.encode(completion_response))
+            ## RESPONSE OBJECT
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+            response = model_response
         elif model in litellm.anthropic_models:
             #anthropic defaults to os.environ.get("ANTHROPIC_API_KEY")
             if api_key:
@@ -183,7 +206,6 @@ def completion(
                     prompt += f"{HUMAN_PROMPT}{message['content']}"
             prompt += f"{AI_PROMPT}"
             anthropic = Anthropic()
-            # check if user passed in max_tokens
             if max_tokens != float('inf'):
                 max_tokens_to_sample = max_tokens
             else:
@@ -196,20 +218,22 @@ def completion(
                 prompt=prompt,
                 max_tokens_to_sample=max_tokens_to_sample
             )
-            new_response = {
-                "choices": [
-                    {
-                        "finish_reason": "stop",
-                        "index": 0,
-                        "message": {
-                            "content": completion.completion,
-                            "role": "assistant"
-                        }
-                    }
-                ]
-            }
-            print_verbose(f"new response: {new_response}")
-            response = new_response
+            completion_response = completion.completion
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            prompt_tokens = anthropic.count_tokens(prompt)
+            completion_tokens = anthropic.count_tokens(completion_response)
+            ## RESPONSE OBJECT
+            print(f"model_response: {model_response}")
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+            response = model_response
         elif model in litellm.cohere_models:
             if api_key:
                 cohere_key = api_key
@@ -226,19 +250,21 @@ def completion(
                 model=model,
                 prompt = prompt
             )
-            new_response = {
-                "choices": [
-                    {
-                        "finish_reason": "stop",
-                        "index": 0,
-                        "message": {
-                            "content": response[0].text,
-                            "role": "assistant"
-                        }
-                    }
-                ],
-            }
-            response = new_response
+            completion_response = response[0].text
+            ## LOGGING
+            logging(model=model, input=prompt, azure=azure, additional_args={"max_tokens": max_tokens, "original_response": completion_response}, logger_fn=logger_fn)
+            prompt_tokens = len(encoding.encode(prompt))
+            completion_tokens = len(encoding.encode(completion_response))
+            ## RESPONSE OBJECT
+            model_response["choices"][0]["message"]["content"] = completion_response
+            model_response["created"] = time.time()
+            model_response["model"] = model
+            model_response["usage"] = {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": completion_tokens,
+                "total_tokens": prompt_tokens + completion_tokens
+            }
+            response = model_response
         else:
             ## LOGGING
             logging(model=model, input=messages, azure=azure, logger_fn=logger_fn)
diff --git a/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc b/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc
index 2baa7bc5f0..0c5fa166d7 100644
Binary files a/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc and b/litellm/tests/__pycache__/test_completion.cpython-311-pytest-7.4.0.pyc differ
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 88d2ef782b..c99b278b13 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -8,11 +8,14 @@
 import pytest
 import litellm
 from litellm import embedding, completion
-litellm.set_verbose = True
+# litellm.set_verbose = True
 
 user_message = "Hello, whats the weather in San Francisco??"
 messages = [{ "content": user_message,"role": "user"}]
 
+def logger_fn(user_model_dict):
+    print(f"user_model_dict: {user_model_dict}")
+
 def test_completion_openai():
     try:
         response = completion(model="gpt-3.5-turbo", messages=messages)
@@ -83,7 +86,7 @@ def test_completion_azure():
 
 def test_completion_claude():
     try:
-        response = completion(model="claude-instant-1", messages=messages)
+        response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
         # Add any assertions here to check the response
         print(response)
     except Exception as e:
@@ -97,7 +100,8 @@ def test_completion_cohere():
     except Exception as e:
         pytest.fail(f"Error occurred: {e}")
-
+# Replicate API endpoints are unstable -> throw random CUDA errors -> this means our tests can fail even if our tests weren't incorrect.
+# [TODO] improve our try-except block to handle for these
 # def test_completion_replicate_llama():
 #     model_name = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"
 #     try:
diff --git a/litellm/utils.py b/litellm/utils.py
index 94880e6699..8e62d3470a 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -33,13 +33,16 @@ def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=N
     if azure:
         model_call_details["azure"] = azure
     if exception:
-        model_call_details["original_exception"] = exception
+        model_call_details["exception"] = exception
 
     if litellm.telemetry:
         safe_crash_reporting(model=model, exception=exception, azure=azure) # log usage-crash details. Do not log any user details. If you want to turn this off, set `litellm.telemetry=False`.
 
     if input:
         model_call_details["input"] = input
+
+    if len(additional_args):
+        model_call_details["additional_args"] = additional_args # log additional call details -> api key, etc.
     if model:
         if azure == True or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_chat_completion_models or model in litellm.open_ai_embedding_models:
@@ -53,7 +56,6 @@ def logging(model=None, input=None, azure=False, additional_args={}, logger_fn=N
             model_call_details["api_key"] = os.environ.get("ANTHROPIC_API_KEY")
         elif model in litellm.cohere_models:
             model_call_details["api_key"] = os.environ.get("COHERE_API_KEY")
-    model_call_details["additional_args"] = additional_args
     ## User Logging -> if you pass in a custom logging function or want to use sentry breadcrumbs
     print_verbose(f"Logging Details: logger_fn - {logger_fn} | callable(logger_fn) - {callable(logger_fn)}")
     if logger_fn and callable(logger_fn):
@@ -318,6 +320,7 @@ def exception_type(model, original_exception):
             exception_type = type(original_exception).__name__
         else:
             exception_type = ""
+        logging(model=model, additional_args={"error_str": error_str, "exception_type": exception_type, "original_exception": original_exception}, logger_fn=user_logger_fn)
         if "claude" in model: #one of the anthropics
             if "status_code" in original_exception:
                 print_verbose(f"status_code: {original_exception.status_code}")
@@ -357,7 +360,7 @@ def exception_type(model, original_exception):
             raise original_exception
     except Exception as e:
         ## LOGGING
-        logging(logger_fn=user_logger_fn, additional_args={"original_exception": original_exception}, exception=e)
+        logging(logger_fn=user_logger_fn, additional_args={"exception_mapping_worked": exception_mapping_worked, "original_exception": original_exception}, exception=e)
         if exception_mapping_worked:
             raise e
         else: # don't let an error with mapping interrupt the user from receiving an error from the llm api calls
diff --git a/pyproject.toml b/pyproject.toml
index ea45eb1cef..4fa2b2fd3f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "0.1.331"
+version = "0.1.341"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT License"
@@ -15,6 +15,7 @@ anthropic = "^0.3.7"
 replicate = "^0.10.0"
 python-dotenv = "^1.0.0"
 tenacity = "^8.0.1"
+tiktoken = "^0.4.0"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/requirements.txt b/requirements.txt
index 098bd48021..eca980d36a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,5 @@ replicate
 pytest
 python-dotenv
 openai[datalib]
-tenacity
\ No newline at end of file
+tenacity
+tiktoken
\ No newline at end of file
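
A minimal, self-contained sketch (not code from the library) of the response-shaping pattern this patch introduces in completion(): deep-copy a default OpenAI-style response template, then fill in the provider's text, a timestamp, and tiktoken-based usage counts. The DEFAULT_RESPONSE constant and build_model_response helper below are illustrative names, not litellm APIs.

import time
from copy import deepcopy

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

# illustrative stand-in for the module-level `new_response` template added by the patch
DEFAULT_RESPONSE = {
    "choices": [
        {
            "finish_reason": "stop",
            "index": 0,
            "message": {"role": "assistant"}
        }
    ]
}

def build_model_response(model, prompt, completion_text):
    # deepcopy so concurrent calls never mutate the shared template
    model_response = deepcopy(DEFAULT_RESPONSE)
    prompt_tokens = len(encoding.encode(prompt))
    completion_tokens = len(encoding.encode(completion_text))
    model_response["choices"][0]["message"]["content"] = completion_text
    model_response["created"] = time.time()
    model_response["model"] = model
    model_response["usage"] = {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": prompt_tokens + completion_tokens
    }
    return model_response

This is also why pyproject.toml and requirements.txt gain a tiktoken dependency: Cohere and Replicate do not return OpenAI-style usage counts, so the patch approximates them with the cl100k_base encoding.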
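
The patch counts tokens two different ways: Anthropic's client-side counter for Claude models, and tiktoken for everything else. A rough sketch of that split, assuming ANTHROPIC_API_KEY is set in the environment (the count_tokens helper name is hypothetical):

import tiktoken
from anthropic import Anthropic

def count_tokens(model, text):
    # hypothetical helper mirroring the per-provider split used in completion()
    if "claude" in model:
        # Anthropic ships its own tokenizer; Anthropic() reads ANTHROPIC_API_KEY from the env
        return Anthropic().count_tokens(text)
    # every other provider falls back to tiktoken's cl100k_base encoding
    return len(tiktoken.get_encoding("cl100k_base").encode(text))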
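
Finally, a hypothetical usage sketch of the logger_fn hook the updated tests exercise, assuming litellm is installed and the relevant provider key (here ANTHROPIC_API_KEY) is exported. The callback receives the model-call details dict that litellm.utils.logging assembles, which after this patch also carries additional_args and exception details:

from litellm import completion

def logger_fn(model_call_dict):
    # called with the model call details: model, input, api key source, additional args, ...
    print(f"model_call_dict: {model_call_dict}")

messages = [{"role": "user", "content": "Hello, whats the weather in San Francisco??"}]
response = completion(model="claude-instant-1", messages=messages, logger_fn=logger_fn)
print(response["choices"][0]["message"]["content"])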