Merge branch 'main' into main

2025-04-26 11:14:04 +00:00 · 2024-05-02 09:46:34 -03:00 · 2024-05-02 09:46:34 -03:00 · 78303b79ee
commit 78303b79ee
parent a9e2ef6212 caf19478af
124 changed files with 6716 additions and 1078 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -40,7 +40,7 @@ jobs:
            pip install "aioboto3==12.3.0"
            pip install langchain
            pip install lunary==0.2.5
-            pip install "langfuse==2.7.3"
+            pip install "langfuse==2.27.1"
            pip install numpydoc
            pip install traceloop-sdk==0.0.69
            pip install openai
--- a/.gitignore
+++ b/.gitignore
@ -51,3 +51,4 @@ loadtest_kub.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_new_secret_config.yaml
 litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -7,7 +7,7 @@ repos:
    rev: 7.0.0  # The version of flake8 to use
    hooks:
    -  id: flake8
-       exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
+       exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
       additional_dependencies: [flake8-print]
       files: litellm/.*\.py
 -   repo: local
--- a/README.md
+++ b/README.md
@ -227,6 +227,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
 | [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity)                  | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
 | [Groq AI](https://docs.litellm.ai/docs/providers/groq)                              | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
 | [anyscale](https://docs.litellm.ai/docs/providers/anyscale)                         | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                |
+| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx)                  | ✅                                                      | ✅                                                                              | ✅                                                                                  | ✅                                                                                | ✅ 
 | [voyage ai](https://docs.litellm.ai/docs/providers/voyage)                          |                                                         |                                                                                 |                                                                                     |                                                                                   | ✅                                                                            |
 | [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) |                                                         |                                                                                 |                                                                                     |                                                                                   | ✅                                                                            |

--- a/cookbook/liteLLM_IBM_Watsonx.ipynb
+++ b/cookbook/liteLLM_IBM_Watsonx.ipynb
--- a/docs/my-website/docs/debugging/local_debugging.md
+++ b/docs/my-website/docs/debugging/local_debugging.md
@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
 response = completion("command-nightly", messages)
 ```

+## JSON Logs 
+
+If you need to store the logs as JSON, just set the `litellm.json_logs = True`.
+
+We currently just log the raw POST request from litellm as a JSON - [**See Code**]. 
+
+[Share feedback here](https://github.com/BerriAI/litellm/issues)
+
 ## Logger Function 
 But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set? 

--- a/docs/my-website/docs/load_test.md
+++ b/docs/my-website/docs/load_test.md
@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())

 ```

+## Multi-Instance TPM/RPM Load Test (Router)
+
+Test if your defined tpm/rpm limits are respected across multiple instances of the Router object. 
+
+In our test:
+- Max RPM per deployment is = 100 requests per minute
+- Max Throughput / min on router = 200 requests per minute (2 deployments)
+- Load we'll send through router = 600 requests per minute
+
+:::info
+
+If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)
+
+:::
+
+### Code 
+
+Let's hit the router with 600 requests per minute. 
+
+Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
+
+
+```python
+from litellm import Router 
+import litellm
+litellm.suppress_debug_info = True
+litellm.set_verbose = False
+import logging
+logging.basicConfig(level=logging.CRITICAL)
+import os, random, uuid, time, asyncio
+
+# Model list for OpenAI and Anthropic models
+model_list = [
+    {
+        "model_name": "fake-openai-endpoint",
+        "litellm_params": {
+            "model": "gpt-3.5-turbo",
+            "api_key": "my-fake-key",
+            "api_base": "http://0.0.0.0:8080",
+            "rpm": 100
+        },
+    },
+    {
+        "model_name": "fake-openai-endpoint",
+        "litellm_params": {
+            "model": "gpt-3.5-turbo",
+            "api_key": "my-fake-key",
+            "api_base": "http://0.0.0.0:8081",
+            "rpm": 100
+        },
+    },
+]
+
+router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
+
+
+
+async def router_completion_non_streaming():
+  try:
+    client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
+    # print(f"client={client}")
+    response = await client.acompletion(
+              model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
+              messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+          )
+    return response
+  except Exception as e:
+    # print(e)
+    return None
+  
+async def loadtest_fn():
+    start = time.time()
+    n = 600  # Number of concurrent tasks
+    tasks = [router_completion_non_streaming() for _ in range(n)]
+    chat_completions = await asyncio.gather(*tasks)
+    successful_completions = [c for c in chat_completions if c is not None]
+    print(n, time.time() - start, len(successful_completions))
+
+def get_utc_datetime():
+    import datetime as dt
+    from datetime import datetime
+
+    if hasattr(dt, "UTC"):
+        return datetime.now(dt.UTC)  # type: ignore
+    else:
+        return datetime.utcnow()  # type: ignore
+
+
+# Run the event loop to execute the async function
+async def parent_fn():
+  for _ in range(10):
+    dt = get_utc_datetime()
+    current_minute = dt.strftime("%H-%M")
+    print(f"triggered new batch - {current_minute}")
+    await loadtest_fn()
+    await asyncio.sleep(10)
+
+asyncio.run(parent_fn())
+```
+## Multi-Instance TPM/RPM Load Test (Proxy)
+
+Test if your defined tpm/rpm limits are respected across multiple instances. 
+
+The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you. 
+
+In our test:
+- Max RPM per deployment is = 100 requests per minute
+- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
+- Load we'll send to proxy = 600 requests per minute
+
+
+So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
+
+:::info
+
+If you don't want to call a real LLM API endpoint, you can setup a fake openai server. [See code](#extra---setup-fake-openai-server)
+
+:::
+
+### 1. Setup config 
+
+```yaml
+model_list:
+- litellm_params:
+    api_base: http://0.0.0.0:8080
+    api_key: my-fake-key
+    model: openai/my-fake-model
+    rpm: 100
+  model_name: fake-openai-endpoint
+- litellm_params:
+    api_base: http://0.0.0.0:8081
+    api_key: my-fake-key
+    model: openai/my-fake-model-2
+    rpm: 100
+  model_name: fake-openai-endpoint
+router_settings:
+  num_retries: 0
+  enable_pre_call_checks: true
+  redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
+  redis_password: os.environ/REDIS_PASSWORD
+  redis_port: os.environ/REDIS_PORT
+  routing_strategy: usage-based-routing-v2
+```
+
+### 2. Start proxy 2 instances
+
+**Instance 1**
+```bash
+litellm --config /path/to/config.yaml --port 4000
+
+## RUNNING on http://0.0.0.0:4000
+```
+
+**Instance 2**
+```bash
+litellm --config /path/to/config.yaml --port 4001
+
+## RUNNING on http://0.0.0.0:4001
+```
+
+### 3. Run Test 
+
+Let's hit the proxy with 600 requests per minute. 
+
+Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
+
+```python
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import random, uuid
+import time, asyncio, litellm
+# import logging
+# logging.basicConfig(level=logging.DEBUG)
+#### LITELLM PROXY #### 
+litellm_client = AsyncOpenAI(
+    api_key="sk-1234", # [CHANGE THIS]
+    base_url="http://0.0.0.0:4000"
+)
+litellm_client_2 = AsyncOpenAI(
+    api_key="sk-1234", # [CHANGE THIS]
+    base_url="http://0.0.0.0:4001"
+)
+
+async def proxy_completion_non_streaming():
+  try:
+    client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
+    # print(f"client={client}")
+    response = await client.chat.completions.create(
+              model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
+              messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+          )
+    return response
+  except Exception as e:
+    # print(e)
+    return None
+  
+async def loadtest_fn():
+    start = time.time()
+    n = 600  # Number of concurrent tasks
+    tasks = [proxy_completion_non_streaming() for _ in range(n)]
+    chat_completions = await asyncio.gather(*tasks)
+    successful_completions = [c for c in chat_completions if c is not None]
+    print(n, time.time() - start, len(successful_completions))
+
+def get_utc_datetime():
+    import datetime as dt
+    from datetime import datetime
+
+    if hasattr(dt, "UTC"):
+        return datetime.now(dt.UTC)  # type: ignore
+    else:
+        return datetime.utcnow()  # type: ignore
+
+
+# Run the event loop to execute the async function
+async def parent_fn():
+  for _ in range(10):
+    dt = get_utc_datetime()
+    current_minute = dt.strftime("%H-%M")
+    print(f"triggered new batch - {current_minute}")
+    await loadtest_fn()
+    await asyncio.sleep(10)
+
+asyncio.run(parent_fn())
+
+```
+
+
+### Extra - Setup Fake OpenAI Server 
+
+Let's setup a fake openai server with a RPM limit of 100.
+
+Let's call our file `fake_openai_server.py`. 
+
+```
+# import sys, os
+# sys.path.insert(
+#     0, os.path.abspath("../")
+# )  # Adds the parent directory to the system path
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from fastapi import FastAPI, Request, HTTPException, UploadFile, File
+import httpx, os, json
+from openai import AsyncOpenAI
+from typing import Optional
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.responses import PlainTextResponse
+
+
+class ProxyException(Exception):
+    # NOTE: DO NOT MODIFY THIS
+    # This is used to map exactly to OPENAI Exceptions
+    def __init__(
+        self,
+        message: str,
+        type: str,
+        param: Optional[str],
+        code: Optional[int],
+    ):
+        self.message = message
+        self.type = type
+        self.param = param
+        self.code = code
+
+    def to_dict(self) -> dict:
+        """Converts the ProxyException instance to a dictionary."""
+        return {
+            "message": self.message,
+            "type": self.type,
+            "param": self.param,
+            "code": self.code,
+        }
+
+
+limiter = Limiter(key_func=get_remote_address)
+app = FastAPI()
+app.state.limiter = limiter
+
+@app.exception_handler(RateLimitExceeded)
+async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
+    return JSONResponse(status_code=429,
+                        content={"detail": "Rate Limited!"})
+
+app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+@limiter.limit("100/minute")
+async def completion(request: Request):
+    # raise HTTPException(status_code=429, detail="Rate Limited!")
+    return {
+        "id": "chatcmpl-123",
+        "object": "chat.completion",
+        "created": 1677652288,
+        "model": None,
+        "system_fingerprint": "fp_44709d6fcb",
+        "choices": [{
+            "index": 0,
+            "message": {
+            "role": "assistant",
+            "content": "\n\nHello there, how may I assist you today?",
+            },
+            "logprobs": None,
+            "finish_reason": "stop"
+        }],
+        "usage": {
+            "prompt_tokens": 9,
+            "completion_tokens": 12,
+            "total_tokens": 21
+        }
+    }
+
+if __name__ == "__main__":
+    import socket
+    import uvicorn
+    port = 8080
+    while True:
+        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+        result = sock.connect_ex(('0.0.0.0', port))
+        if result != 0:
+            print(f"Port {port} is available, starting server...")
+            break
+        else:
+            port += 1
+
+    uvicorn.run(app, host="0.0.0.0", port=port)
+```
+
+```bash
+python3 fake_openai_server.py
+```
--- a/docs/my-website/docs/observability/custom_callback.md
+++ b/docs/my-website/docs/observability/custom_callback.md
@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
 ## Examples

 ### Custom Callback to track costs for Streaming + Non-Streaming
+By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
 ```python

+# Step 1. Write your custom callback function
 def track_cost_callback(
    kwargs,                 # kwargs to completion
    completion_response,    # response from completion
    start_time, end_time    # start/end time
 ):
    try:
-        # init logging config
-        logging.basicConfig(
-                filename='cost.log',
-                level=logging.INFO,
-                format='%(asctime)s - %(message)s',
-                datefmt='%Y-%m-%d %H:%M:%S'
-        )
-
-        # check if it has collected an entire stream response
-        if "complete_streaming_response" in kwargs:
-            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost 
-            completion_response=kwargs["complete_streaming_response"]
-            input_text = kwargs["messages"]
-            output_text = completion_response["choices"][0]["message"]["content"]
-            response_cost = litellm.completion_cost(
-                model = kwargs["model"],
-                messages = input_text,
-                completion=output_text
-            )
-            print("streaming response_cost", response_cost)
-            logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
-
-        # for non streaming responses
-        else:
-            # we pass the completion_response obj
-            if kwargs["stream"] != True:
-                response_cost = litellm.completion_cost(completion_response=completion_response)
+        response_cost = kwargs["response_cost"] # litellm calculates response cost for you
        print("regular response_cost", response_cost)
-                logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
    except:
        pass

-# Assign the custom callback function
+# Step 2. Assign the custom callback function
 litellm.success_callback = [track_cost_callback]

+# Step 3. Make litellm.completion call
 response = completion(
    model="gpt-3.5-turbo",
    messages=[
--- a/docs/my-website/docs/observability/langfuse_integration.md
+++ b/docs/my-website/docs/observability/langfuse_integration.md
@ -121,10 +121,12 @@ response = completion(
  metadata={
      "generation_name": "ishaan-test-generation",  # set langfuse Generation Name
      "generation_id": "gen-id22",                  # set langfuse Generation ID 
-      "trace_id": "trace-id22",                     # set langfuse Trace ID
      "trace_user_id": "user-id2",                  # set langfuse Trace User ID
      "session_id": "session-1",                    # set langfuse Session ID
      "tags": ["tag1", "tag2"]                      # set langfuse Tags
+      "trace_id": "trace-id22",                     # set langfuse Trace ID
+      ### OR ### 
+      "existing_trace_id": "trace-id22",                     # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
  },
 )

@ -167,6 +169,9 @@ messages = [
 chat(messages)
 ```

+## Redacting Messages, Response Content from Langfuse Logging 
+
+Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.

 ## Troubleshooting & Errors
 ### Data not getting logged to Langfuse ? 
--- a/docs/my-website/docs/observability/openmeter.md
+++ b/docs/my-website/docs/observability/openmeter.md
@ -0,0 +1,97 @@
+import Image from '@theme/IdealImage';
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# OpenMeter - Usage-Based Billing
+
+[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.
+
+<Image img={require('../../img/openmeter.png')} />
+
+:::info
+We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
+join our [discord](https://discord.gg/wuPM9dRgDw)
+::: 
+
+
+## Quick Start
+Use just 2 lines of code, to instantly log your responses **across all providers** with OpenMeter
+
+Get your OpenMeter API Key from https://openmeter.cloud/meters
+
+```python
+litellm.success_callback = ["openmeter"] # logs cost + usage of successful calls to openmeter
+```
+
+
+<Tabs>
+<TabItem value="sdk" label="SDK">
+
+```python
+# pip install langfuse 
+import litellm
+import os
+
+# from https://openmeter.cloud
+os.environ["OPENMETER_API_ENDPOINT"] = ""
+os.environ["OPENMETER_API_KEY"] = ""
+
+# LLM API Keys
+os.environ['OPENAI_API_KEY']=""
+
+# set langfuse as a callback, litellm will send the data to langfuse
+litellm.success_callback = ["openmeter"] 
+ 
+# openai call
+response = litellm.completion(
+  model="gpt-3.5-turbo",
+  messages=[
+    {"role": "user", "content": "Hi 👋 - i'm openai"}
+  ]
+)
+```
+
+</TabItem>
+<TabItem value="proxy" label="PROXY">
+
+1. Add to Config.yaml
+```yaml
+model_list:
+- litellm_params:
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_key: my-fake-key
+    model: openai/my-fake-model
+  model_name: fake-openai-endpoint
+
+litellm_settings:
+  success_callback: ["openmeter"] # 👈 KEY CHANGE
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "fake-openai-endpoint",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ],
+    }
+'
+```
+
+</TabItem>
+</Tabs>
+
+
+<Image img={require('../../img/openmeter_img_2.png')} />
--- a/docs/my-website/docs/observability/sentry.md
+++ b/docs/my-website/docs/observability/sentry.md
@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
 print(response)
 ```

+## Redacting Messages, Response Content from Sentry Logging 
+
+Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to sentry, but request metadata will still be logged.
+
 [Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry. 

--- a/docs/my-website/docs/providers/mistral.md
+++ b/docs/my-website/docs/providers/mistral.md
@ -53,6 +53,50 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported.
 | open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` | 


+## Function Calling 
+
+```python
+from litellm import completion
+
+# set env
+os.environ["MISTRAL_API_KEY"] = "your-api-key"
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA",
+                    },
+                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                },
+                "required": ["location"],
+            },
+        },
+    }
+]
+messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
+
+response = completion(
+    model="mistral/mistral-large-latest",
+    messages=messages,
+    tools=tools,
+    tool_choice="auto",
+)
+# Add any assertions, here to check response args
+print(response)
+assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
+assert isinstance(
+    response.choices[0].message.tool_calls[0].function.arguments, str
+)
+```
+
 ## Sample Usage - Embedding
 ```python
 from litellm import embedding
--- a/docs/my-website/docs/providers/vllm.md
+++ b/docs/my-website/docs/providers/vllm.md
@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.

 🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)

+
+:::info
+
+To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
+
+:::
+
 ### Quick Start
 ```
 pip install litellm vllm
--- a/docs/my-website/docs/providers/watsonx.md
+++ b/docs/my-website/docs/providers/watsonx.md
@ -0,0 +1,284 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# IBM watsonx.ai
+
+LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.
+
+## Environment Variables
+```python
+os.environ["WATSONX_URL"] = ""  # (required) Base URL of your WatsonX instance
+# (required) either one of the following:
+os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key
+os.environ["WATSONX_TOKEN"] = "" # IAM auth token
+# optional - can also be passed as params to completion() or embedding()
+os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance
+os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models
+```
+
+See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.
+
+## Usage
+
+<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
+  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
+</a>
+
+```python
+import os
+from litellm import completion
+
+os.environ["WATSONX_URL"] = ""
+os.environ["WATSONX_APIKEY"] = ""
+
+response = completion(
+  model="watsonx/ibm/granite-13b-chat-v2",
+  messages=[{ "content": "what is your favorite colour?","role": "user"}],
+  project_id="<my-project-id>" # or pass with os.environ["WATSONX_PROJECT_ID"]
+)
+
+response = completion(
+  model="watsonx/meta-llama/llama-3-8b-instruct",
+  messages=[{ "content": "what is your favorite colour?","role": "user"}],
+  project_id="<my-project-id>"
+)
+```
+
+## Usage - Streaming
+```python
+import os
+from litellm import completion
+
+os.environ["WATSONX_URL"] = ""
+os.environ["WATSONX_APIKEY"] = ""
+os.environ["WATSONX_PROJECT_ID"] = ""
+
+response = completion(
+  model="watsonx/ibm/granite-13b-chat-v2",
+  messages=[{ "content": "what is your favorite colour?","role": "user"}],
+  stream=True
+)
+for chunk in response:
+  print(chunk)
+```
+
+#### Example Streaming Output Chunk
+```json
+{
+  "choices": [
+    {
+      "finish_reason": null,
+      "index": 0,
+      "delta": {
+        "content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
+      }
+    }
+  ],
+  "created": null,
+  "model": "watsonx/ibm/granite-13b-chat-v2",
+  "usage": {
+    "prompt_tokens": null,
+    "completion_tokens": null,
+    "total_tokens": null
+  }
+}
+```
+
+## Usage - Models in deployment spaces
+
+Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space). 
+
+The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`. 
+
+```python
+import litellm
+response = litellm.completion(
+    model="watsonx/deployment/<deployment_id>",
+    messages=[{"content": "Hello, how are you?", "role": "user"}],
+    space_id="<deployment_space_id>"
+)
+```
+
+## Usage - Embeddings
+
+LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.
+
+```python
+from litellm import embedding
+
+response = embedding(
+    model="watsonx/ibm/slate-30m-english-rtrvr",
+    input=["What is the capital of France?"],
+    project_id="<my-project-id>"
+)
+print(response)
+# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
+```
+
+## OpenAI Proxy Usage 
+
+Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server
+
+### 1. Save keys in your environment
+
+```bash
+export WATSONX_URL=""
+export WATSONX_APIKEY=""
+export WATSONX_PROJECT_ID=""
+```
+
+### 2. Start the proxy 
+
+<Tabs>
+<TabItem value="cli" label="CLI">
+
+```bash
+$ litellm --model watsonx/meta-llama/llama-3-8b-instruct
+
+# Server running on http://0.0.0.0:4000
+```
+
+</TabItem>
+<TabItem value="config" label="config.yaml">
+
+```yaml
+model_list:
+  - model_name: llama-3-8b
+    litellm_params:
+      # all params accepted by litellm.completion()
+      model: watsonx/meta-llama/llama-3-8b-instruct
+      api_key: "os.environ/WATSONX_API_KEY" # does os.getenv("WATSONX_API_KEY")
+```
+</TabItem>
+</Tabs>
+
+### 3. Test it
+
+
+<Tabs>
+<TabItem value="Curl" label="Curl Request">
+
+```shell
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "llama-3-8b",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what is your favorite colour?"
+        }
+      ]
+    }
+'
+```
+</TabItem>
+<TabItem value="openai" label="OpenAI v1.0.0+">
+
+```python
+import openai
+client = openai.OpenAI(
+    api_key="anything",
+    base_url="http://0.0.0.0:4000"
+)
+
+# request sent to model set on litellm proxy, `litellm --model`
+response = client.chat.completions.create(model="llama-3-8b", messages=[
+    {
+        "role": "user",
+        "content": "what is your favorite colour?"
+    }
+])
+
+print(response)
+
+```
+</TabItem>
+<TabItem value="langchain" label="Langchain">
+
+```python
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.schema import HumanMessage, SystemMessage
+
+chat = ChatOpenAI(
+    openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
+    model = "llama-3-8b",
+    temperature=0.1
+)
+
+messages = [
+    SystemMessage(
+        content="You are a helpful assistant that im using to make a test request to."
+    ),
+    HumanMessage(
+        content="test from litellm. tell me why it's amazing in 1 sentence"
+    ),
+]
+response = chat(messages)
+
+print(response)
+```
+</TabItem>
+</Tabs>
+
+
+## Authentication
+
+### Passing credentials as parameters
+
+You can also pass the credentials as parameters to the completion and embedding functions.
+
+```python
+import os
+from litellm import completion
+
+response = completion(
+            model="watsonx/ibm/granite-13b-chat-v2",
+            messages=[{ "content": "What is your favorite color?","role": "user"}],
+            url="",
+            api_key="",
+            project_id=""
+)
+```
+
+
+## Supported IBM watsonx.ai Models
+
+Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:
+
+| Mode Name | Command |
+| ---------- | --------- |
+| Flan T5 XXL | `completion(model=watsonx/google/flan-t5-xxl, messages=messages)` |
+| Flan Ul2 | `completion(model=watsonx/google/flan-ul2, messages=messages)` |
+| Mt0 XXL | `completion(model=watsonx/bigscience/mt0-xxl, messages=messages)` |
+| Gpt Neox | `completion(model=watsonx/eleutherai/gpt-neox-20b, messages=messages)` |
+| Mpt 7B Instruct2 | `completion(model=watsonx/ibm/mpt-7b-instruct2, messages=messages)` |
+| Starcoder | `completion(model=watsonx/bigcode/starcoder, messages=messages)` |
+| Llama 2 70B Chat | `completion(model=watsonx/meta-llama/llama-2-70b-chat, messages=messages)` |
+| Llama 2 13B Chat | `completion(model=watsonx/meta-llama/llama-2-13b-chat, messages=messages)` |
+| Granite 13B Instruct | `completion(model=watsonx/ibm/granite-13b-instruct-v1, messages=messages)` |
+| Granite 13B Chat | `completion(model=watsonx/ibm/granite-13b-chat-v1, messages=messages)` |
+| Flan T5 XL | `completion(model=watsonx/google/flan-t5-xl, messages=messages)` |
+| Granite 13B Chat V2 | `completion(model=watsonx/ibm/granite-13b-chat-v2, messages=messages)` |
+| Granite 13B Instruct V2 | `completion(model=watsonx/ibm/granite-13b-instruct-v2, messages=messages)` |
+| Elyza Japanese Llama 2 7B Instruct | `completion(model=watsonx/elyza/elyza-japanese-llama-2-7b-instruct, messages=messages)` |
+| Mixtral 8X7B Instruct V01 Q | `completion(model=watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q, messages=messages)` |
+
+
+For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).
+
+
+## Supported IBM watsonx.ai Embedding Models
+
+| Model Name           | Function Call                               |
+|----------------------|---------------------------------------------|
+| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
+| Slate 125m  | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |
+
+
+For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).
--- a/docs/my-website/docs/proxy/alerting.md
+++ b/docs/my-website/docs/proxy/alerting.md
@ -1,13 +1,13 @@
-# Slack Alerting
+# 🚨 Alerting 

 Get alerts for:
- hanging LLM api calls
- failed LLM api calls
- slow LLM api calls
- budget Tracking per key/user:
+- Hanging LLM api calls
+- Failed LLM api calls
+- Slow LLM api calls
+- Budget Tracking per key/user:
    - When a User/Key crosses their Budget 
    - When a User/Key is 15% away from crossing their Budget
- failed db read/writes
+- Failed db read/writes

 ## Quick Start

--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@ -62,9 +62,11 @@ model_list:

 litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
+  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env

 general_settings: 
  master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
+  alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
 ```
 :::info

--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber

 <TabItem value="basic" label="Basic">

-**Step 1. Create a file called `litellm_config.yaml`**
+### Step 1. CREATE config.yaml 

-  Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env)
-  ```yaml
-  model_list:
+Example `litellm_config.yaml` 
+
+```yaml
+model_list:
  - model_name: azure-gpt-3.5
    litellm_params:
      model: azure/<your-azure-model-deployment>
-        api_base: os.environ/AZURE_API_BASE
-        api_key: os.environ/AZURE_API_KEY
+      api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
+      api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
      api_version: "2023-07-01-preview"
-  ```
+```

-**Step 2. Run litellm docker image**

-  See the latest available ghcr docker image here:
-  https://github.com/berriai/litellm/pkgs/container/litellm

-  Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command. 
-  The `-v` command will mount that file
+### Step 2. RUN Docker Image

-  Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1
-
-  ```shell
-  docker run \
+```shell
+docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
    ghcr.io/berriai/litellm:main-latest \
    --config /app/config.yaml --detailed_debug
-  ```
+```

-**Step 3. Send a Test Request**
+Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
+
+### Step 3. TEST Request

  Pass `model=azure-gpt-3.5` this was set on step 1

@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
 | Docs | When to Use |
 | --- | --- |
 | [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
-| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
+| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: When deploying with a database providing a `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env ) |
 | [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
 | [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |

 ## Deploy with Database
 ### Docker, Kubernetes, Helm Chart

+Requirements:
+- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc) Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env 
+- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)

 <Tabs>

@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
 ```shell
 docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e LITELLM_MASTER_KEY=sk-1234 \
+    -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
@ -267,12 +269,12 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
 #### Step 1. Create deployment.yaml

 ```yaml
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
+apiVersion: apps/v1
+kind: Deployment
+metadata:
  name: litellm-deployment
-   spec:
-     replicas: 1
+spec:
+  replicas: 3
  selector:
    matchLabels:
      app: litellm
@ -283,10 +285,47 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
    spec:
      containers:
        - name: litellm-container
-             image: ghcr.io/berriai/litellm-database:main-latest
+          image: ghcr.io/berriai/litellm:main-latest
+          imagePullPolicy: Always
          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
            - name: DATABASE_URL
-                value: postgresql://<user>:<password>@<host>:<port>/<dbname>
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml"  # Update the path to mount the config file
+          volumeMounts:                 # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes:  # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config  
+
 ```

 ```bash
--- a/docs/my-website/docs/proxy/logging.md
+++ b/docs/my-website/docs/proxy/logging.md
@ -10,6 +10,7 @@ Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTeleme
 - [Async Custom Callbacks](#custom-callback-class-async)
 - [Async Custom Callback APIs](#custom-callback-apis-async)
 - [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
+- [Logging to OpenMeter](#logging-proxy-inputoutput---langfuse)
 - [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
 - [Logging to DataDog](#logging-proxy-inputoutput---datadog)
 - [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
@ -401,7 +402,7 @@ litellm_settings:
 Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API 

 ## Logging Proxy Input/Output - Langfuse
-We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse
+We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successfull LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment

 **Step 1** Install langfuse

@ -419,7 +420,13 @@ litellm_settings:
  success_callback: ["langfuse"]
 ```

-**Step 3**: Start the proxy, make a test request
+**Step 3**: Set required env variables for logging to langfuse
+```shell
+export LANGFUSE_PUBLIC_KEY="pk_kk"
+export LANGFUSE_SECRET_KEY="sk_ss
+```
+
+**Step 4**: Start the proxy, make a test request

 Start proxy
 ```shell
@ -569,6 +576,75 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \

 All requests made with these keys will log data to their team-specific logging.

+### Redacting Messages, Response Content from Langfuse Logging 
+
+Set `litellm.turn_off_message_logging=True` This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
+
+```yaml
+model_list:
+ - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: gpt-3.5-turbo
+litellm_settings:
+  success_callback: ["langfuse"]
+  turn_off_message_logging: True
+```
+
+
+
+## Logging Proxy Cost + Usage - OpenMeter
+
+Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
+
+**Required Env Variables**
+
+```bash
+# from https://openmeter.cloud
+export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
+export OPENMETER_API_KEY=""
+```
+
+### Quick Start 
+
+1. Add to Config.yaml
+```yaml
+model_list:
+- litellm_params:
+    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
+    api_key: my-fake-key
+    model: openai/my-fake-model
+  model_name: fake-openai-endpoint
+
+litellm_settings:
+  success_callback: ["openmeter"] # 👈 KEY CHANGE
+```
+
+2. Start Proxy
+
+```
+litellm --config /path/to/config.yaml
+```
+
+3. Test it! 
+
+```bash
+curl --location 'http://0.0.0.0:4000/chat/completions' \
+--header 'Content-Type: application/json' \
+--data ' {
+      "model": "fake-openai-endpoint",
+      "messages": [
+        {
+          "role": "user",
+          "content": "what llm are you"
+        }
+      ],
+    }
+'
+```
+
+
+<Image img={require('../../img/openmeter_img_2.png')} />
+
 ## Logging Proxy Input/Output - DataDog
 We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successfull LLM calls to DataDog

--- a/docs/my-website/docs/routing.md
+++ b/docs/my-website/docs/routing.md
@ -95,7 +95,7 @@ print(response)
 - `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
 - `router.aimage_generation()` - async image generation calls

-### Advanced - Routing Strategies
+## Advanced - Routing Strategies
 #### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based

 Router provides 4 strategies for routing your calls across multiple deployments: 
@ -278,6 +278,36 @@ router_settings:
 	routing_strategy_args: {"ttl": 10}
 ```

+### Set Lowest Latency Buffer
+
+Set a buffer within which deployments are candidates for making calls to. 
+
+E.g. 
+
+if you have 5 deployments
+
+```
+https://litellm-prod-1.openai.azure.com/: 0.07s
+https://litellm-prod-2.openai.azure.com/: 0.1s
+https://litellm-prod-3.openai.azure.com/: 0.1s
+https://litellm-prod-4.openai.azure.com/: 0.1s
+https://litellm-prod-5.openai.azure.com/: 4.66s
+```
+
+to prevent initially overloading `prod-1`, with all requests - we can set a buffer of 50%, to consider deployments `prod-2, prod-3, prod-4`. 
+
+**In Router**
+```python 
+router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
+```
+
+**In Proxy**
+
+```yaml
+router_settings:
+	routing_strategy_args: {"lowest_latency_buffer": 0.5}
+```
+
 </TabItem>
 <TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">

@ -443,6 +473,35 @@ asyncio.run(router_acompletion())

 ## Basic Reliability

+### Max Parallel Requests (ASYNC)
+
+Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios. 
+
+If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit. 
+
+
+```python
+from litellm import Router 
+
+model_list = [{
+	"model_name": "gpt-4",
+	"litellm_params": {
+		"model": "azure/gpt-4",
+		...
+		"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
+	}
+}]
+
+### OR ### 
+
+router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS 
+
+
+# deployment max parallel requests > default max parallel requests
+```
+
+[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
+
 ### Timeouts 

 The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well. 
--- a/docs/my-website/docs/set_keys.md
+++ b/docs/my-website/docs/set_keys.md
@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
 * API Base
 * API Version
 * API Type
+* Project
+* Location
+* Token

 Useful Helper functions: 
 * [`check_valid_key()`](#check_valid_key)
@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
 os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
 ```

+### Setting Project, Location, Token
+
+For cloud providers:
+- Azure
+- Bedrock
+- GCP
+- Watson AI 
+
+you might need to set additional parameters. LiteLLM provides a common set of params, that we map across all providers. 
+
+|      | LiteLLM param | Watson       | Vertex AI    | Azure        | Bedrock      |
+|------|--------------|--------------|--------------|--------------|--------------|
+| Project | project | watsonx_project | vertex_project | n/a | n/a |
+| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
+| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |
+
+If you want, you can call them by their provider-specific params as well. 
+
 ## litellm variables

 ### litellm.api_key
--- a/docs/my-website/img/openmeter.png
+++ b/docs/my-website/img/openmeter.png
--- a/docs/my-website/img/openmeter_img_2.png
+++ b/docs/my-website/img/openmeter_img_2.png
--- a/docs/my-website/sidebars.js
+++ b/docs/my-website/sidebars.js
@ -43,6 +43,12 @@ const sidebars = {
        "proxy/user_keys",
        "proxy/enterprise",
        "proxy/virtual_keys",
+        "proxy/alerting",
+        {
+          type: "category",
+          label: "Logging",
+          items: ["proxy/logging", "proxy/streaming_logging"],
+        },
        "proxy/team_based_routing",
        "proxy/ui",
        "proxy/cost_tracking",
@ -58,11 +64,6 @@ const sidebars = {
        "proxy/pii_masking",
        "proxy/prompt_injection",
        "proxy/caching",
-        {
-          type: "category",
-          label: "Logging, Alerting",
-          items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
-        },
        "proxy/prometheus",
        "proxy/call_hooks",
        "proxy/rules",
@ -148,6 +149,7 @@ const sidebars = {
        "providers/openrouter", 
        "providers/custom_openai_proxy",
        "providers/petals",
+        "providers/watsonx",
      ],
    },
    "proxy/custom_pricing",
@ -168,6 +170,7 @@ const sidebars = {
        "observability/custom_callback",
        "observability/langfuse_integration",
        "observability/sentry",
+        "observability/openmeter",
        "observability/promptlayer_integration",
        "observability/wandb_integration",
        "observability/langsmith_integration",
@ -175,7 +178,6 @@ const sidebars = {
        "observability/traceloop_integration",
        "observability/athina_integration",
        "observability/lunary_integration",
-        "observability/athina_integration",
        "observability/helicone_integration",
        "observability/supabase_integration",
        `observability/telemetry`,
--- a/litellm-js/spend-logs/package-lock.json
+++ b/litellm-js/spend-logs/package-lock.json
@ -6,7 +6,7 @@
    "": {
      "dependencies": {
        "@hono/node-server": "^1.9.0",
-        "hono": "^4.1.5"
+        "hono": "^4.2.7"
      },
      "devDependencies": {
        "@types/node": "^20.11.17",
@ -463,9 +463,9 @@
      }
    },
    "node_modules/hono": {
-      "version": "4.1.5",
-      "resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
-      "integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
+      "version": "4.2.7",
+      "resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
+      "integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
      "engines": {
        "node": ">=16.0.0"
      }
--- a/litellm-js/spend-logs/package.json
+++ b/litellm-js/spend-logs/package.json
@ -4,7 +4,7 @@
  },
  "dependencies": {
    "@hono/node-server": "^1.9.0",
-    "hono": "^4.1.5"
+    "hono": "^4.2.7"
  },
  "devDependencies": {
    "@types/node": "^20.11.17",
--- a/litellm/init.py
+++ b/litellm/init.py
@ -2,7 +2,7 @@
 import threading, requests, os
 from typing import Callable, List, Optional, Dict, Union, Any, Literal
 from litellm.caching import Cache
-from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
+from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
 from litellm.proxy._types import (
    KeyManagementSystem,
    KeyManagementSettings,
@ -22,6 +22,7 @@ success_callback: List[Union[str, Callable]] = []
 failure_callback: List[Union[str, Callable]] = []
 service_callback: List[Union[str, Callable]] = []
 callbacks: List[Callable] = []
+_custom_logger_compatible_callbacks: list = ["openmeter"]
 _langfuse_default_tags: Optional[
    List[
        Literal[
@ -45,6 +46,7 @@ _async_failure_callback: List[Callable] = (
 )  # internal variable - async custom callbacks are routed here.
 pre_call_rules: List[Callable] = []
 post_call_rules: List[Callable] = []
+turn_off_message_logging: Optional[bool] = False
 ## end of callbacks #############

 email: Optional[str] = (
@ -58,6 +60,7 @@ max_tokens = 256  # OpenAI Defaults
 drop_params = False
 modify_params = False
 retry = True
+### AUTH ###
 api_key: Optional[str] = None
 openai_key: Optional[str] = None
 azure_key: Optional[str] = None
@ -76,7 +79,12 @@ cloudflare_api_key: Optional[str] = None
 baseten_key: Optional[str] = None
 aleph_alpha_key: Optional[str] = None
 nlp_cloud_key: Optional[str] = None
+common_cloud_provider_auth_params: dict = {
+    "params": ["project", "region_name", "token"],
+    "providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
+}
 use_client: bool = False
+ssl_verify: bool = True
 disable_streaming_logging: bool = False
 ### GUARDRAILS ###
 llamaguard_model_name: Optional[str] = None
@ -298,6 +306,7 @@ aleph_alpha_models: List = []
 bedrock_models: List = []
 deepinfra_models: List = []
 perplexity_models: List = []
+watsonx_models: List = []
 for key, value in model_cost.items():
    if value.get("litellm_provider") == "openai":
        open_ai_chat_completion_models.append(key)
@ -342,6 +351,8 @@ for key, value in model_cost.items():
        deepinfra_models.append(key)
    elif value.get("litellm_provider") == "perplexity":
        perplexity_models.append(key)
+    elif value.get("litellm_provider") == "watsonx":
+        watsonx_models.append(key)

 # known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
 openai_compatible_endpoints: List = [
@ -478,6 +489,7 @@ model_list = (
    + perplexity_models
    + maritalk_models
    + vertex_language_models
+    + watsonx_models
 )

 provider_list: List = [
@ -516,6 +528,7 @@ provider_list: List = [
    "cloudflare",
    "xinference",
    "fireworks_ai",
+    "watsonx",
    "custom",  # custom apis
 ]

@ -537,6 +550,7 @@ models_by_provider: dict = {
    "deepinfra": deepinfra_models,
    "perplexity": perplexity_models,
    "maritalk": maritalk_models,
+    "watsonx": watsonx_models,
 }

 # mapping for those models which have larger equivalents
@ -647,9 +661,11 @@ from .llms.bedrock import (
    AmazonLlamaConfig,
    AmazonStabilityConfig,
    AmazonMistralConfig,
+    AmazonBedrockGlobalConfig,
 )
 from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
 from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
+from .llms.watsonx import IBMWatsonXAIConfig
 from .main import *  # type: ignore
 from .integrations import *
 from .exceptions import (
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@ -1,7 +1,7 @@
 import logging

 set_verbose = False
-
+json_logs = False
 # Create a handler for the logger (you may need to adapt this based on your needs)
 handler = logging.StreamHandler()
 handler.setLevel(logging.DEBUG)
--- a/litellm/integrations/langfuse.py
+++ b/litellm/integrations/langfuse.py
@ -12,9 +12,12 @@ import litellm

 class LangFuseLogger:
    # Class variables or attributes
-    def __init__(self, langfuse_public_key=None, langfuse_secret=None):
+    def __init__(
+        self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
+    ):
        try:
            from langfuse import Langfuse
+            import langfuse
        except Exception as e:
            raise Exception(
                f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
@ -25,14 +28,20 @@ class LangFuseLogger:
        self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
        self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
        self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
-        self.Langfuse = Langfuse(
-            public_key=self.public_key,
-            secret_key=self.secret_key,
-            host=self.langfuse_host,
-            release=self.langfuse_release,
-            debug=self.langfuse_debug,
-            flush_interval=1,  # flush interval in seconds
-        )
+
+        parameters = {
+            "public_key": self.public_key,
+            "secret_key": self.secret_key,
+            "host": self.langfuse_host,
+            "release": self.langfuse_release,
+            "debug": self.langfuse_debug,
+            "flush_interval": flush_interval,  # flush interval in seconds
+        }
+
+        if Version(langfuse.version.__version__) >= Version("2.6.0"):
+            parameters["sdk_integration"] = "litellm"
+
+        self.Langfuse = Langfuse(**parameters)

        # set the current langfuse project id in the environ
        # this is used by Alerting to link to the correct project
@ -77,13 +86,14 @@ class LangFuseLogger:
        print_verbose,
        level="DEFAULT",
        status_message=None,
-    ):
+    ) -> dict:
        # Method definition

        try:
            print_verbose(
                f"Langfuse Logging - Enters logging function for model {kwargs}"
            )
+
            litellm_params = kwargs.get("litellm_params", {})
            metadata = (
                litellm_params.get("metadata", {}) or {}
@ -137,8 +147,10 @@ class LangFuseLogger:
                input = prompt
                output = response_obj["data"]
            print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
+            trace_id = None
+            generation_id = None
            if self._is_langfuse_v2():
-                self._log_langfuse_v2(
+                trace_id, generation_id = self._log_langfuse_v2(
                    user_id,
                    metadata,
                    litellm_params,
@ -168,10 +180,12 @@ class LangFuseLogger:
                f"Langfuse Layer Logging - final response object: {response_obj}"
            )
            verbose_logger.info(f"Langfuse Layer Logging - logging success")
+
+            return {"trace_id": trace_id, "generation_id": generation_id}
        except:
            traceback.print_exc()
            verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
-            pass
+            return {"trace_id": None, "generation_id": None}

    async def _async_log_event(
        self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
@ -243,7 +257,7 @@ class LangFuseLogger:
        response_obj,
        level,
        print_verbose,
-    ):
+    ) -> tuple:
        import langfuse

        try:
@ -262,15 +276,21 @@ class LangFuseLogger:
                tags = metadata_tags

            trace_name = metadata.get("trace_name", None)
-            if trace_name is None:
+            trace_id = metadata.get("trace_id", None)
+            existing_trace_id = metadata.get("existing_trace_id", None)
+            if trace_name is None and existing_trace_id is None:
                # just log `litellm-{call_type}` as the trace name
+                ## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
                trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"

+            if existing_trace_id is not None:
+                trace_params = {"id": existing_trace_id}
+            else:  # don't overwrite an existing trace
                trace_params = {
                    "name": trace_name,
                    "input": input,
                    "user_id": metadata.get("trace_user_id", user_id),
-                "id": metadata.get("trace_id", None),
+                    "id": trace_id,
                    "session_id": metadata.get("session_id", None),
                }

@ -335,6 +355,7 @@ class LangFuseLogger:
                        kwargs["cache_hit"] = False
                    tags.append(f"cache_hit:{kwargs['cache_hit']}")
                    clean_metadata["cache_hit"] = kwargs["cache_hit"]
+                if existing_trace_id is None:
                    trace_params.update({"tags": tags})

            proxy_server_request = litellm_params.get("proxy_server_request", None)
@ -355,8 +376,6 @@ class LangFuseLogger:
                    "headers": clean_headers,
                }

-            print_verbose(f"trace_params: {trace_params}")
-
            trace = self.Langfuse.trace(**trace_params)

            generation_id = None
@ -373,7 +392,11 @@ class LangFuseLogger:
                # just log `litellm-{call_type}` as the generation name
                generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"

+            if response_obj is not None and "system_fingerprint" in response_obj:
                system_fingerprint = response_obj.get("system_fingerprint", None)
+            else:
+                system_fingerprint = None
+
            if system_fingerprint is not None:
                optional_params["system_fingerprint"] = system_fingerprint

@ -402,8 +425,9 @@ class LangFuseLogger:
                    "completion_start_time", None
                )

-            print_verbose(f"generation_params: {generation_params}")
+            generation_client = trace.generation(**generation_params)
            
-            trace.generation(**generation_params)
+            return generation_client.trace_id, generation_id
        except Exception as e:
            verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
+            return None, None
--- a/litellm/integrations/langsmith.py
+++ b/litellm/integrations/langsmith.py
@ -73,10 +73,6 @@ class LangsmithLogger:
                elif type(value) != dict and is_serializable(value=value):
                    new_kwargs[key] = value

-            print(f"type of response: {type(response_obj)}")
-            for k, v in new_kwargs.items():
-                print(f"key={k}, type of arg: {type(v)}, value={v}")
-
            if isinstance(response_obj, BaseModel):
                try:
                    response_obj = response_obj.model_dump()
--- a/litellm/integrations/openmeter.py
+++ b/litellm/integrations/openmeter.py
@ -0,0 +1,123 @@
+# What is this?
+## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
+
+import dotenv, os, json
+import requests
+import litellm
+
+dotenv.load_dotenv()  # Loading env variables using dotenv
+import traceback
+from litellm.integrations.custom_logger import CustomLogger
+from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
+import uuid
+
+
+def get_utc_datetime():
+    import datetime as dt
+    from datetime import datetime
+
+    if hasattr(dt, "UTC"):
+        return datetime.now(dt.UTC)  # type: ignore
+    else:
+        return datetime.utcnow()  # type: ignore
+
+
+class OpenMeterLogger(CustomLogger):
+    def __init__(self) -> None:
+        super().__init__()
+        self.validate_environment()
+        self.async_http_handler = AsyncHTTPHandler()
+        self.sync_http_handler = HTTPHandler()
+
+    def validate_environment(self):
+        """
+        Expects
+        OPENMETER_API_ENDPOINT,
+        OPENMETER_API_KEY,
+
+        in the environment
+        """
+        missing_keys = []
+        if litellm.get_secret("OPENMETER_API_KEY", None) is None:
+            missing_keys.append("OPENMETER_API_KEY")
+
+        if len(missing_keys) > 0:
+            raise Exception("Missing keys={} in environment.".format(missing_keys))
+
+    def _common_logic(self, kwargs: dict, response_obj):
+        call_id = response_obj.get("id", kwargs.get("litellm_call_id"))
+        dt = get_utc_datetime().isoformat()
+        cost = kwargs.get("response_cost", None)
+        model = kwargs.get("model")
+        usage = {}
+        if (
+            isinstance(response_obj, litellm.ModelResponse)
+            or isinstance(response_obj, litellm.EmbeddingResponse)
+        ) and hasattr(response_obj, "usage"):
+            usage = {
+                "prompt_tokens": response_obj["usage"].get("prompt_tokens", 0),
+                "completion_tokens": response_obj["usage"].get("completion_tokens", 0),
+                "total_tokens": response_obj["usage"].get("total_tokens"),
+            }
+
+        return {
+            "specversion": "1.0",
+            "type": os.getenv("OPENMETER_EVENT_TYPE", "litellm_tokens"),
+            "id": call_id,
+            "time": dt,
+            "subject": kwargs.get("user", ""),  # end-user passed in via 'user' param
+            "source": "litellm-proxy",
+            "data": {"model": model, "cost": cost, **usage},
+        }
+
+    def log_success_event(self, kwargs, response_obj, start_time, end_time):
+        _url = litellm.get_secret(
+            "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
+        )
+        if _url.endswith("/"):
+            _url += "api/v1/events"
+        else:
+            _url += "/api/v1/events"
+
+        api_key = litellm.get_secret("OPENMETER_API_KEY")
+
+        _data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
+        self.sync_http_handler.post(
+            url=_url,
+            data=_data,
+            headers={
+                "Content-Type": "application/cloudevents+json",
+                "Authorization": "Bearer {}".format(api_key),
+            },
+        )
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        _url = litellm.get_secret(
+            "OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
+        )
+        if _url.endswith("/"):
+            _url += "api/v1/events"
+        else:
+            _url += "/api/v1/events"
+
+        api_key = litellm.get_secret("OPENMETER_API_KEY")
+
+        _data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
+        _headers = {
+            "Content-Type": "application/cloudevents+json",
+            "Authorization": "Bearer {}".format(api_key),
+        }
+
+        try:
+            response = await self.async_http_handler.post(
+                url=_url,
+                data=json.dumps(_data),
+                headers=_headers,
+            )
+
+            response.raise_for_status()
+        except Exception as e:
+            print(f"\nAn Exception Occurred - {str(e)}")
+            if hasattr(response, "text"):
+                print(f"\nError Message: {response.text}")
+            raise e
--- a/litellm/integrations/slack_alerting.py
+++ b/litellm/integrations/slack_alerting.py
@ -7,11 +7,12 @@ import copy
 import traceback
 from litellm._logging import verbose_logger, verbose_proxy_logger
 import litellm
-from typing import List, Literal, Any, Union, Optional
+from typing import List, Literal, Any, Union, Optional, Dict
 from litellm.caching import DualCache
 import asyncio
 import aiohttp
 from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
+import datetime


 class SlackAlerting:
@ -37,12 +38,28 @@ class SlackAlerting:
            "budget_alerts",
            "db_exceptions",
        ],
+        alert_to_webhook_url: Optional[
+            Dict
+        ] = None,  # if user wants to separate alerts to diff channels
    ):
        self.alerting_threshold = alerting_threshold
        self.alerting = alerting
        self.alert_types = alert_types
        self.internal_usage_cache = DualCache()
        self.async_http_handler = AsyncHTTPHandler()
+        self.alert_to_webhook_url = alert_to_webhook_url
+        self.langfuse_logger = None
+
+        try:
+            from litellm.integrations.langfuse import LangFuseLogger
+
+            self.langfuse_logger = LangFuseLogger(
+                os.getenv("LANGFUSE_PUBLIC_KEY"),
+                os.getenv("LANGFUSE_SECRET_KEY"),
+                flush_interval=1,
+            )
+        except:
+            pass

        pass

@ -51,6 +68,7 @@ class SlackAlerting:
        alerting: Optional[List] = None,
        alerting_threshold: Optional[float] = None,
        alert_types: Optional[List] = None,
+        alert_to_webhook_url: Optional[Dict] = None,
    ):
        if alerting is not None:
            self.alerting = alerting
@ -59,6 +77,13 @@ class SlackAlerting:
        if alert_types is not None:
            self.alert_types = alert_types

+        if alert_to_webhook_url is not None:
+            # update the dict
+            if self.alert_to_webhook_url is None:
+                self.alert_to_webhook_url = alert_to_webhook_url
+            else:
+                self.alert_to_webhook_url.update(alert_to_webhook_url)
+
    async def deployment_in_cooldown(self):
        pass

@ -81,39 +106,68 @@ class SlackAlerting:
        request_info: str,
        request_data: Optional[dict] = None,
        kwargs: Optional[dict] = None,
+        type: Literal["hanging_request", "slow_response"] = "hanging_request",
+        start_time: Optional[datetime.datetime] = None,
+        end_time: Optional[datetime.datetime] = None,
    ):
        import uuid

        # For now: do nothing as we're debugging why this is not working as expected
+        if request_data is not None:
+            trace_id = request_data.get("metadata", {}).get(
+                "trace_id", None
+            )  # get langfuse trace id
+            if trace_id is None:
+                trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
+                request_data["metadata"]["trace_id"] = trace_id
+        elif kwargs is not None:
+            _litellm_params = kwargs.get("litellm_params", {})
+            trace_id = _litellm_params.get("metadata", {}).get(
+                "trace_id", None
+            )  # get langfuse trace id
+            if trace_id is None:
+                trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
+                _litellm_params["metadata"]["trace_id"] = trace_id
+
+        # Log hanging request as an error on langfuse
+        if type == "hanging_request":
+            if self.langfuse_logger is not None:
+                _logging_kwargs = copy.deepcopy(request_data)
+                if _logging_kwargs is None:
+                    _logging_kwargs = {}
+                _logging_kwargs["litellm_params"] = {}
+                request_data = request_data or {}
+                _logging_kwargs["litellm_params"]["metadata"] = request_data.get(
+                    "metadata", {}
+                )
+                # log to langfuse in a separate thread
+                import threading
+
+                threading.Thread(
+                    target=self.langfuse_logger.log_event,
+                    args=(
+                        _logging_kwargs,
+                        None,
+                        start_time,
+                        end_time,
+                        None,
+                        print,
+                        "ERROR",
+                        "Requests is hanging",
+                    ),
+                ).start()
+
+        _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
+        _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
+
+        # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
+
+        _langfuse_url = (
+            f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
+        )
+        request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
        return request_info

-        # if request_data is not None:
-        #     trace_id = request_data.get("metadata", {}).get(
-        #         "trace_id", None
-        #     )  # get langfuse trace id
-        #     if trace_id is None:
-        #         trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
-        #         request_data["metadata"]["trace_id"] = trace_id
-        # elif kwargs is not None:
-        #     _litellm_params = kwargs.get("litellm_params", {})
-        #     trace_id = _litellm_params.get("metadata", {}).get(
-        #         "trace_id", None
-        #     )  # get langfuse trace id
-        #     if trace_id is None:
-        #         trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
-        #         _litellm_params["metadata"]["trace_id"] = trace_id
-
-        # _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
-        # _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
-
-        # # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
-
-        # _langfuse_url = (
-        #     f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
-        # )
-        # request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
-        # return request_info
-
    def _response_taking_too_long_callback(
        self,
        kwargs,  # kwargs to completion
@ -140,7 +194,6 @@ class SlackAlerting:
            raise e

    def _get_deployment_latencies_to_alert(self, metadata=None):
-
        if metadata is None:
            return None

@ -156,6 +209,14 @@ class SlackAlerting:
            _deployment_latencies = metadata["_latency_per_deployment"]
            if len(_deployment_latencies) == 0:
                return None
+            try:
+                # try sorting deployments by latency
+                _deployment_latencies = sorted(
+                    _deployment_latencies.items(), key=lambda x: x[1]
+                )
+                _deployment_latencies = dict(_deployment_latencies)
+            except:
+                pass
            for api_base, latency in _deployment_latencies.items():
                _message_to_send += f"\n{api_base}: {round(latency,2)}s"
            _message_to_send = "```" + _message_to_send + "```"
@ -171,8 +232,6 @@ class SlackAlerting:
        if self.alerting is None or self.alert_types is None:
            return

-        if "llm_too_slow" not in self.alert_types:
-            return
        time_difference_float, model, api_base, messages = (
            self._response_taking_too_long_callback(
                kwargs=kwargs,
@ -185,7 +244,7 @@ class SlackAlerting:
        if time_difference_float > self.alerting_threshold:
            if "langfuse" in litellm.success_callback:
                request_info = self._add_langfuse_trace_id_to_alert(
-                    request_info=request_info, kwargs=kwargs
+                    request_info=request_info, kwargs=kwargs, type="slow_response"
                )
            # add deployment latencies to alert
            if (
@ -205,6 +264,7 @@ class SlackAlerting:
            await self.send_alert(
                message=slow_message + request_info,
                level="Low",
+                alert_type="llm_too_slow",
            )

    async def log_failure_event(self, original_exception: Exception):
@ -212,8 +272,8 @@ class SlackAlerting:

    async def response_taking_too_long(
        self,
-        start_time: Optional[float] = None,
-        end_time: Optional[float] = None,
+        start_time: Optional[datetime.datetime] = None,
+        end_time: Optional[datetime.datetime] = None,
        type: Literal["hanging_request", "slow_response"] = "hanging_request",
        request_data: Optional[dict] = None,
    ):
@ -233,17 +293,10 @@ class SlackAlerting:
            except:
                messages = ""
            request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
-            if "langfuse" in litellm.success_callback:
-                request_info = self._add_langfuse_trace_id_to_alert(
-                    request_info=request_info, request_data=request_data
-                )
        else:
            request_info = ""

        if type == "hanging_request":
-            # Simulate a long-running operation that could take more than 5 minutes
-            if "llm_requests_hanging" not in self.alert_types:
-                return
            await asyncio.sleep(
                self.alerting_threshold
            )  # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
@ -281,6 +334,15 @@ class SlackAlerting:
                    f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
                )

+                if "langfuse" in litellm.success_callback:
+                    request_info = self._add_langfuse_trace_id_to_alert(
+                        request_info=request_info,
+                        request_data=request_data,
+                        type="hanging_request",
+                        start_time=start_time,
+                        end_time=end_time,
+                    )
+
                # add deployment latencies to alert
                _deployment_latency_map = self._get_deployment_latencies_to_alert(
                    metadata=request_data.get("metadata", {})
@ -291,6 +353,7 @@ class SlackAlerting:
                await self.send_alert(
                    message=alerting_message + request_info,
                    level="Medium",
+                    alert_type="llm_requests_hanging",
                )

    async def budget_alerts(
@ -336,8 +399,7 @@ class SlackAlerting:
            user_info = f"\nUser ID: {user_id}\n Error {error_message}"
            message = "Failed Tracking Cost for" + user_info
            await self.send_alert(
-                message=message,
-                level="High",
+                message=message, level="High", alert_type="budget_alerts"
            )
            return
        elif type == "projected_limit_exceeded" and user_info is not None:
@ -353,8 +415,7 @@ class SlackAlerting:
            """
            message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
            await self.send_alert(
-                message=message,
-                level="High",
+                message=message, level="High", alert_type="budget_alerts"
            )
            return
        else:
@ -382,8 +443,7 @@ class SlackAlerting:
            result = await _cache.async_get_cache(key=message)
            if result is None:
                await self.send_alert(
-                    message=message,
-                    level="High",
+                    message=message, level="High", alert_type="budget_alerts"
                )
                await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
            return
@ -395,8 +455,7 @@ class SlackAlerting:
            result = await _cache.async_get_cache(key=cache_key)
            if result is None:
                await self.send_alert(
-                    message=message,
-                    level="Medium",
+                    message=message, level="Medium", alert_type="budget_alerts"
                )

                await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
@ -409,15 +468,25 @@ class SlackAlerting:
            result = await _cache.async_get_cache(key=message)
            if result is None:
                await self.send_alert(
-                    message=message,
-                    level="Low",
+                    message=message, level="Low", alert_type="budget_alerts"
                )
                await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
            return

        return

-    async def send_alert(self, message: str, level: Literal["Low", "Medium", "High"]):
+    async def send_alert(
+        self,
+        message: str,
+        level: Literal["Low", "Medium", "High"],
+        alert_type: Literal[
+            "llm_exceptions",
+            "llm_too_slow",
+            "llm_requests_hanging",
+            "budget_alerts",
+            "db_exceptions",
+        ],
+    ):
        """
        Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298

@ -432,12 +501,6 @@ class SlackAlerting:
            level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
            message: str - what is the alert about
        """
-        print(
-            "inside send alert for slack, message: ",
-            message,
-            "self.alerting: ",
-            self.alerting,
-        )
        if self.alerting is None:
            return

@ -453,7 +516,15 @@ class SlackAlerting:
        if _proxy_base_url is not None:
            formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"

+        # check if we find the slack webhook url in self.alert_to_webhook_url
+        if (
+            self.alert_to_webhook_url is not None
+            and alert_type in self.alert_to_webhook_url
+        ):
+            slack_webhook_url = self.alert_to_webhook_url[alert_type]
+        else:
            slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
+
        if slack_webhook_url is None:
            raise Exception("Missing SLACK_WEBHOOK_URL from environment")
        payload = {"text": formatted_message}
--- a/litellm/llms/azure.py
+++ b/litellm/llms/azure.py
@ -96,6 +96,15 @@ class AzureOpenAIConfig(OpenAIConfig):
            top_p,
        )

+    def get_mapped_special_auth_params(self) -> dict:
+        return {"token": "azure_ad_token"}
+
+    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
+        for param, value in non_default_params.items():
+            if param == "token":
+                optional_params["azure_ad_token"] = value
+        return optional_params
+

 def select_azure_base_url_or_endpoint(azure_client_params: dict):
    # azure_client_params = {
--- a/litellm/llms/bedrock.py
+++ b/litellm/llms/bedrock.py
@ -29,6 +29,24 @@ class BedrockError(Exception):
        )  # Call the base class constructor with the parameters it needs


+class AmazonBedrockGlobalConfig:
+    def __init__(self):
+        pass
+
+    def get_mapped_special_auth_params(self) -> dict:
+        """
+        Mapping of common auth params across bedrock/vertex/azure/watsonx
+        """
+        return {"region_name": "aws_region_name"}
+
+    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
+        mapped_params = self.get_mapped_special_auth_params()
+        for param, value in non_default_params.items():
+            if param in mapped_params:
+                optional_params[mapped_params[param]] = value
+        return optional_params
+
+
 class AmazonTitanConfig:
    """
    Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1
@ -666,6 +684,10 @@ def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
        prompt = prompt_factory(
            model=model, messages=messages, custom_llm_provider="bedrock"
        )
+    elif provider == "meta":
+        prompt = prompt_factory(
+            model=model, messages=messages, custom_llm_provider="bedrock"
+        )
    else:
        prompt = ""
        for message in messages:
@ -945,7 +967,7 @@ def completion(
            original_response=json.dumps(response_body),
            additional_args={"complete_input_dict": data},
        )
-        print_verbose(f"raw model_response: {response}")
+        print_verbose(f"raw model_response: {response_body}")
        ## RESPONSE OBJECT
        outputText = "default"
        if provider == "ai21":
@ -1058,6 +1080,7 @@ def completion(
            outputText = response_body.get("results")[0].get("outputText")

        response_metadata = response.get("ResponseMetadata", {})
+
        if response_metadata.get("HTTPStatusCode", 500) >= 400:
            raise BedrockError(
                message=outputText,
@ -1093,11 +1116,13 @@ def completion(
            prompt_tokens = response_metadata.get(
                "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
            )
+            _text_response = model_response["choices"][0]["message"].get("content", "")
            completion_tokens = response_metadata.get(
                "x-amzn-bedrock-output-token-count",
                len(
                    encoding.encode(
-                        model_response["choices"][0]["message"].get("content", "")
+                        _text_response,
+                        disallowed_special=(),
                    )
                ),
            )
--- a/litellm/llms/ollama.py
+++ b/litellm/llms/ollama.py
@ -213,12 +213,13 @@ def get_ollama_response(
    ## RESPONSE OBJECT
    model_response["choices"][0]["finish_reason"] = "stop"
    if optional_params.get("format", "") == "json":
+        function_call = json.loads(response_json["response"])
        message = litellm.Message(
            content=None,
            tool_calls=[
                {
                    "id": f"call_{str(uuid.uuid4())}",
-                    "function": {"arguments": response_json["response"], "name": ""},
+                    "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
                    "type": "function",
                }
            ],
@ -310,15 +311,13 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
            ## RESPONSE OBJECT
            model_response["choices"][0]["finish_reason"] = "stop"
            if data.get("format", "") == "json":
+                function_call = json.loads(response_json["response"])
                message = litellm.Message(
                    content=None,
                    tool_calls=[
                        {
                            "id": f"call_{str(uuid.uuid4())}",
-                            "function": {
-                                "arguments": response_json["response"],
-                                "name": "",
-                            },
+                            "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
                            "type": "function",
                        }
                    ],
--- a/litellm/llms/ollama_chat.py
+++ b/litellm/llms/ollama_chat.py
@ -285,15 +285,13 @@ def get_ollama_response(
    ## RESPONSE OBJECT
    model_response["choices"][0]["finish_reason"] = "stop"
    if data.get("format", "") == "json":
+        function_call = json.loads(response_json["message"]["content"])
        message = litellm.Message(
            content=None,
            tool_calls=[
                {
                    "id": f"call_{str(uuid.uuid4())}",
-                    "function": {
-                        "arguments": response_json["message"]["content"],
-                        "name": "",
-                    },
+                    "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
                    "type": "function",
                }
            ],
@ -415,15 +413,13 @@ async def ollama_acompletion(
            ## RESPONSE OBJECT
            model_response["choices"][0]["finish_reason"] = "stop"
            if data.get("format", "") == "json":
+                function_call = json.loads(response_json["message"]["content"])
                message = litellm.Message(
                    content=None,
                    tool_calls=[
                        {
                            "id": f"call_{str(uuid.uuid4())}",
-                            "function": {
-                                "arguments": response_json["message"]["content"],
-                                "name": function_name or "",
-                            },
+                            "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
                            "type": "function",
                        }
                    ],
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
                )
            else:
                openai_aclient = client
+
            ## LOGGING
            logging_obj.pre_call(
                input=data["messages"],
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@ -3,8 +3,14 @@ import requests, traceback
 import json, re, xml.etree.ElementTree as ET
 from jinja2 import Template, exceptions, meta, BaseLoader
 from jinja2.sandbox import ImmutableSandboxedEnvironment
-from typing import Optional, Any
-from typing import List
+from typing import (
+    Any,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Sequence,
+)
 import litellm


@ -431,6 +437,35 @@ def format_prompt_togetherai(messages, prompt_format, chat_template):
    return prompt


+### IBM Granite
+
+
+def ibm_granite_pt(messages: list):
+    """
+    IBM's Granite models uses the template:
+    <|system|> {system_message} <|user|> {user_message} <|assistant|> {assistant_message}
+
+    See: https://www.ibm.com/docs/en/watsonx-as-a-service?topic=solutions-supported-foundation-models
+    """
+    return custom_prompt(
+        messages=messages,
+        role_dict={
+            "system": {
+                "pre_message": "<|system|>\n",
+                "post_message": "\n",
+            },
+            "user": {
+                "pre_message": "<|user|>\n",
+                "post_message": "\n",
+            },
+            "assistant": {
+                "pre_message": "<|assistant|>\n",
+                "post_message": "\n",
+            },
+        },
+    ).strip()
+
+
 ### ANTHROPIC ###


@ -1017,6 +1052,30 @@ def get_system_prompt(messages):
    return system_prompt, messages


+def convert_to_documents(
+    observations: Any,
+) -> List[MutableMapping]:
+    """Converts observations into a 'document' dict"""
+    documents: List[MutableMapping] = []
+    if isinstance(observations, str):
+        # strings are turned into a key/value pair and a key of 'output' is added.
+        observations = [{"output": observations}]
+    elif isinstance(observations, Mapping):
+        # single mappings are transformed into a list to simplify the rest of the code.
+        observations = [observations]
+    elif not isinstance(observations, Sequence):
+        # all other types are turned into a key/value pair within a list
+        observations = [{"output": observations}]
+
+    for doc in observations:
+        if not isinstance(doc, Mapping):
+            # types that aren't Mapping are turned into a key/value pair.
+            doc = {"output": doc}
+        documents.append(doc)
+
+    return documents
+
+
 def convert_openai_message_to_cohere_tool_result(message):
    """
    OpenAI message with a tool result looks like:
@ -1058,7 +1117,7 @@ def convert_openai_message_to_cohere_tool_result(message):
            "parameters": {"location": "San Francisco, CA"},
            "generation_id": tool_call_id,
        },
-        "outputs": [content],
+        "outputs": convert_to_documents(content),
    }
    return cohere_tool_result

@ -1071,7 +1130,7 @@ def cohere_message_pt(messages: list):
        if message["role"] == "tool":
            tool_result = convert_openai_message_to_cohere_tool_result(message)
            tool_results.append(tool_result)
-        else:
+        elif message.get("content"):
            prompt += message["content"] + "\n\n"
    prompt = prompt.rstrip()
    return prompt, tool_results
@ -1346,12 +1405,47 @@ def prompt_factory(
                return anthropic_pt(messages=messages)
        elif "mistral." in model:
            return mistral_instruct_pt(messages=messages)
+        elif "llama2" in model and "chat" in model:
+            return llama_2_chat_pt(messages=messages)
+        elif "llama3" in model and "instruct" in model:
+            return hf_chat_template(
+                model="meta-llama/Meta-Llama-3-8B-Instruct",
+                messages=messages,
+            )
    elif custom_llm_provider == "perplexity":
        for message in messages:
            message.pop("name", None)
        return messages
    elif custom_llm_provider == "azure_text":
        return azure_text_pt(messages=messages)
+    elif custom_llm_provider == "watsonx":
+        if "granite" in model and "chat" in model:
+            # granite-13b-chat-v1 and granite-13b-chat-v2 use a specific prompt template
+            return ibm_granite_pt(messages=messages)
+        elif "ibm-mistral" in model and "instruct" in model:
+            # models like ibm-mistral/mixtral-8x7b-instruct-v01-q use the mistral instruct prompt template
+            return mistral_instruct_pt(messages=messages)
+        elif "meta-llama/llama-3" in model and "instruct" in model:
+            # https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
+            return custom_prompt(
+                role_dict={
+                    "system": {
+                        "pre_message": "<|start_header_id|>system<|end_header_id|>\n",
+                        "post_message": "<|eot_id|>",
+                    },
+                    "user": {
+                        "pre_message": "<|start_header_id|>user<|end_header_id|>\n",
+                        "post_message": "<|eot_id|>",
+                    },
+                    "assistant": {
+                        "pre_message": "<|start_header_id|>assistant<|end_header_id|>\n",
+                        "post_message": "<|eot_id|>",
+                    },
+                },
+                messages=messages,
+                initial_prompt_value="<|begin_of_text|>",
+                final_prompt_value="<|start_header_id|>assistant<|end_header_id|>\n",
+            )
    try:
        if "meta-llama/llama-2" in model and "chat" in model:
            return llama_2_chat_pt(messages=messages)
@ -1359,11 +1453,8 @@ def prompt_factory(
            "meta-llama/llama-3" in model or "meta-llama-3" in model
        ) and "instruct" in model:
            return hf_chat_template(
-                model=model,
+                model="meta-llama/Meta-Llama-3-8B-Instruct",
                messages=messages,
-                chat_template=known_tokenizer_config[  # type: ignore
-                    "meta-llama/Meta-Llama-3-8B-Instruct"
-                ]["tokenizer"]["chat_template"],
            )
        elif (
            "tiiuae/falcon" in model
--- a/litellm/llms/replicate.py
+++ b/litellm/llms/replicate.py
@ -112,10 +112,16 @@ def start_prediction(
    }

    initial_prediction_data = {
-        "version": version_id,
        "input": input_data,
    }

+    if ":" in version_id and len(version_id) > 64:
+        model_parts = version_id.split(":")
+        if (
+            len(model_parts) > 1 and len(model_parts[1]) == 64
+        ):  ## checks if model name has a 64 digit code - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
+            initial_prediction_data["version"] = model_parts[1]
+
    ## LOGGING
    logging_obj.pre_call(
        input=input_data["prompt"],
--- a/litellm/llms/vertex_ai.py
+++ b/litellm/llms/vertex_ai.py
@ -143,7 +143,9 @@ class VertexAIConfig:
                optional_params["temperature"] = value
            if param == "top_p":
                optional_params["top_p"] = value
-            if param == "stream":
+            if (
+                param == "stream" and value == True
+            ):  # sending stream = False, can cause it to get passed unchecked and raise issues
                optional_params["stream"] = value
            if param == "n":
                optional_params["candidate_count"] = value
@ -182,6 +184,20 @@ class VertexAIConfig:
                pass
        return optional_params

+    def get_mapped_special_auth_params(self) -> dict:
+        """
+        Common auth params across bedrock/vertex_ai/azure/watsonx
+        """
+        return {"project": "vertex_project", "region_name": "vertex_location"}
+
+    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
+        mapped_params = self.get_mapped_special_auth_params()
+
+        for param, value in non_default_params.items():
+            if param in mapped_params:
+                optional_params[mapped_params[param]] = value
+        return optional_params
+

 import asyncio

@ -527,6 +543,7 @@ def completion(
                "instances": instances,
                "vertex_location": vertex_location,
                "vertex_project": vertex_project,
+                "safety_settings": safety_settings,
                **optional_params,
            }
            if optional_params.get("stream", False) is True:
@ -541,8 +558,9 @@ def completion(
            tools = optional_params.pop("tools", None)
            prompt, images = _gemini_vision_convert_messages(messages=messages)
            content = [prompt] + images
-            if "stream" in optional_params and optional_params["stream"] == True:
-                stream = optional_params.pop("stream")
+            stream = optional_params.pop("stream", False)
+            if stream == True:
+
                request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
                logging_obj.pre_call(
                    input=prompt,
@ -810,6 +828,7 @@ async def async_completion(
    instances=None,
    vertex_project=None,
    vertex_location=None,
+    safety_settings=None,
    **optional_params,
 ):
    """
@ -820,6 +839,7 @@ async def async_completion(
            print_verbose("\nMaking VertexAI Gemini Pro/Vision Call")
            print_verbose(f"\nProcessing input messages = {messages}")
            tools = optional_params.pop("tools", None)
+            stream = optional_params.pop("stream", False)

            prompt, images = _gemini_vision_convert_messages(messages=messages)
            content = [prompt] + images
@ -840,6 +860,7 @@ async def async_completion(
            response = await llm_model._generate_content_async(
                contents=content,
                generation_config=optional_params,
+                safety_settings=safety_settings,
                tools=tools,
            )

@ -1018,6 +1039,7 @@ async def async_streaming(
    instances=None,
    vertex_project=None,
    vertex_location=None,
+    safety_settings=None,
    **optional_params,
 ):
    """
@ -1044,6 +1066,7 @@ async def async_streaming(
        response = await llm_model._generate_content_streaming_async(
            contents=content,
            generation_config=optional_params,
+            safety_settings=safety_settings,
            tools=tools,
        )

--- a/litellm/llms/watsonx.py
+++ b/litellm/llms/watsonx.py
@ -0,0 +1,609 @@
+from enum import Enum
+import json, types, time  # noqa: E401
+from contextlib import contextmanager
+from typing import Callable, Dict, Optional, Any, Union, List
+
+import httpx
+import requests
+import litellm
+from litellm.utils import ModelResponse, get_secret, Usage
+
+from .base import BaseLLM
+from .prompt_templates import factory as ptf
+
+
+class WatsonXAIError(Exception):
+    def __init__(self, status_code, message, url: Optional[str] = None):
+        self.status_code = status_code
+        self.message = message
+        url = url or "https://https://us-south.ml.cloud.ibm.com"
+        self.request = httpx.Request(method="POST", url=url)
+        self.response = httpx.Response(status_code=status_code, request=self.request)
+        super().__init__(
+            self.message
+        )  # Call the base class constructor with the parameters it needs
+
+
+class IBMWatsonXAIConfig:
+    """
+    Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
+    (See ibm_watsonx_ai.metanames.GenTextParamsMetaNames for a list of all available params)
+
+    Supported params for all available watsonx.ai foundational models.
+
+    - `decoding_method` (str): One of "greedy" or "sample"
+
+    - `temperature` (float): Sets the model temperature for sampling - not available when decoding_method='greedy'.
+
+    - `max_new_tokens` (integer): Maximum length of the generated tokens.
+
+    - `min_new_tokens` (integer): Maximum length of input tokens. Any more than this will be truncated.
+
+    - `length_penalty` (dict): A dictionary with keys "decay_factor" and "start_index".
+
+    - `stop_sequences` (string[]): list of strings to use as stop sequences.
+
+    - `top_k` (integer): top k for sampling - not available when decoding_method='greedy'.
+
+    - `top_p` (integer): top p for sampling - not available when decoding_method='greedy'.
+
+    - `repetition_penalty` (float): token repetition penalty during text generation.
+
+    - `truncate_input_tokens` (integer): Truncate input tokens to this length.
+
+    - `include_stop_sequences` (bool): If True, the stop sequence will be included at the end of the generated text in the case of a match.
+
+    - `return_options` (dict): A dictionary of options to return. Options include "input_text", "generated_tokens", "input_tokens", "token_ranks". Values are boolean.
+
+    - `random_seed` (integer): Random seed for text generation.
+
+    - `moderations` (dict): Dictionary of properties that control the moderations, for usages such as Hate and profanity (HAP) and PII filtering.
+
+    - `stream` (bool): If True, the model will return a stream of responses.
+    """
+
+    decoding_method: Optional[str] = "sample"
+    temperature: Optional[float] = None
+    max_new_tokens: Optional[int] = None  # litellm.max_tokens
+    min_new_tokens: Optional[int] = None
+    length_penalty: Optional[dict] = None  # e.g {"decay_factor": 2.5, "start_index": 5}
+    stop_sequences: Optional[List[str]] = None  # e.g ["}", ")", "."]
+    top_k: Optional[int] = None
+    top_p: Optional[float] = None
+    repetition_penalty: Optional[float] = None
+    truncate_input_tokens: Optional[int] = None
+    include_stop_sequences: Optional[bool] = False
+    return_options: Optional[Dict[str, bool]] = None
+    random_seed: Optional[int] = None  # e.g 42
+    moderations: Optional[dict] = None
+    stream: Optional[bool] = False
+
+    def __init__(
+        self,
+        decoding_method: Optional[str] = None,
+        temperature: Optional[float] = None,
+        max_new_tokens: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        length_penalty: Optional[dict] = None,
+        stop_sequences: Optional[List[str]] = None,
+        top_k: Optional[int] = None,
+        top_p: Optional[float] = None,
+        repetition_penalty: Optional[float] = None,
+        truncate_input_tokens: Optional[int] = None,
+        include_stop_sequences: Optional[bool] = None,
+        return_options: Optional[dict] = None,
+        random_seed: Optional[int] = None,
+        moderations: Optional[dict] = None,
+        stream: Optional[bool] = None,
+        **kwargs,
+    ) -> None:
+        locals_ = locals()
+        for key, value in locals_.items():
+            if key != "self" and value is not None:
+                setattr(self.__class__, key, value)
+
+    @classmethod
+    def get_config(cls):
+        return {
+            k: v
+            for k, v in cls.__dict__.items()
+            if not k.startswith("__")
+            and not isinstance(
+                v,
+                (
+                    types.FunctionType,
+                    types.BuiltinFunctionType,
+                    classmethod,
+                    staticmethod,
+                ),
+            )
+            and v is not None
+        }
+
+    def get_supported_openai_params(self):
+        return [
+            "temperature",  # equivalent to temperature
+            "max_tokens",  # equivalent to max_new_tokens
+            "top_p",  # equivalent to top_p
+            "frequency_penalty",  # equivalent to repetition_penalty
+            "stop",  # equivalent to stop_sequences
+            "seed",  # equivalent to random_seed
+            "stream",  # equivalent to stream
+        ]
+
+    def get_mapped_special_auth_params(self) -> dict:
+        """
+        Common auth params across bedrock/vertex_ai/azure/watsonx
+        """
+        return {
+            "project": "watsonx_project",
+            "region_name": "watsonx_region_name",
+            "token": "watsonx_token",
+        }
+
+    def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
+        mapped_params = self.get_mapped_special_auth_params()
+
+        for param, value in non_default_params.items():
+            if param in mapped_params:
+                optional_params[mapped_params[param]] = value
+        return optional_params
+
+
+def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
+    # handle anthropic prompts and amazon titan prompts
+    if model in custom_prompt_dict:
+        # check if the model has a registered custom prompt
+        model_prompt_dict = custom_prompt_dict[model]
+        prompt = ptf.custom_prompt(
+            messages=messages,
+            role_dict=model_prompt_dict.get(
+                "role_dict", model_prompt_dict.get("roles")
+            ),
+            initial_prompt_value=model_prompt_dict.get("initial_prompt_value", ""),
+            final_prompt_value=model_prompt_dict.get("final_prompt_value", ""),
+            bos_token=model_prompt_dict.get("bos_token", ""),
+            eos_token=model_prompt_dict.get("eos_token", ""),
+        )
+        return prompt
+    elif provider == "ibm":
+        prompt = ptf.prompt_factory(
+            model=model, messages=messages, custom_llm_provider="watsonx"
+        )
+    elif provider == "ibm-mistralai":
+        prompt = ptf.mistral_instruct_pt(messages=messages)
+    else:
+        prompt = ptf.prompt_factory(
+            model=model, messages=messages, custom_llm_provider="watsonx"
+        )
+    return prompt
+
+
+class WatsonXAIEndpoint(str, Enum):
+    TEXT_GENERATION = "/ml/v1/text/generation"
+    TEXT_GENERATION_STREAM = "/ml/v1/text/generation_stream"
+    DEPLOYMENT_TEXT_GENERATION = "/ml/v1/deployments/{deployment_id}/text/generation"
+    DEPLOYMENT_TEXT_GENERATION_STREAM = (
+        "/ml/v1/deployments/{deployment_id}/text/generation_stream"
+    )
+    EMBEDDINGS = "/ml/v1/text/embeddings"
+    PROMPTS = "/ml/v1/prompts"
+
+
+class IBMWatsonXAI(BaseLLM):
+    """
+    Class to interface with IBM Watsonx.ai API for text generation and embeddings.
+
+    Reference: https://cloud.ibm.com/apidocs/watsonx-ai
+    """
+
+    api_version = "2024-03-13"
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def _prepare_text_generation_req(
+        self,
+        model_id: str,
+        prompt: str,
+        stream: bool,
+        optional_params: dict,
+        print_verbose: Optional[Callable] = None,
+    ) -> dict:
+        """
+        Get the request parameters for text generation.
+        """
+        api_params = self._get_api_params(optional_params, print_verbose=print_verbose)
+        # build auth headers
+        api_token = api_params.get("token")
+
+        headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+        extra_body_params = optional_params.pop("extra_body", {})
+        optional_params.update(extra_body_params)
+        # init the payload to the text generation call
+        payload = {
+            "input": prompt,
+            "moderations": optional_params.pop("moderations", {}),
+            "parameters": optional_params,
+        }
+        request_params = dict(version=api_params["api_version"])
+        # text generation endpoint deployment or model / stream or not
+        if model_id.startswith("deployment/"):
+            # deployment models are passed in as 'deployment/<deployment_id>'
+            if api_params.get("space_id") is None:
+                raise WatsonXAIError(
+                    status_code=401,
+                    url=api_params["url"],
+                    message="Error: space_id is required for models called using the 'deployment/' endpoint. Pass in the space_id as a parameter or set it in the WX_SPACE_ID environment variable.",
+                )
+            deployment_id = "/".join(model_id.split("/")[1:])
+            endpoint = (
+                WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION_STREAM.value
+                if stream
+                else WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION.value
+            )
+            endpoint = endpoint.format(deployment_id=deployment_id)
+        else:
+            payload["model_id"] = model_id
+            payload["project_id"] = api_params["project_id"]
+            endpoint = (
+                WatsonXAIEndpoint.TEXT_GENERATION_STREAM
+                if stream
+                else WatsonXAIEndpoint.TEXT_GENERATION
+            )
+        url = api_params["url"].rstrip("/") + endpoint
+        return dict(
+            method="POST", url=url, headers=headers, json=payload, params=request_params
+        )
+
+    def _get_api_params(
+        self, params: dict, print_verbose: Optional[Callable] = None
+    ) -> dict:
+        """
+        Find watsonx.ai credentials in the params or environment variables and return the headers for authentication.
+        """
+        # Load auth variables from params
+        url = params.pop("url", params.pop("api_base", params.pop("base_url", None)))
+        api_key = params.pop("apikey", None)
+        token = params.pop("token", None)
+        project_id = params.pop(
+            "project_id", params.pop("watsonx_project", None)
+        )  # watsonx.ai project_id - allow 'watsonx_project' to be consistent with how vertex project implementation works -> reduce provider-specific params
+        space_id = params.pop("space_id", None)  # watsonx.ai deployment space_id
+        region_name = params.pop("region_name", params.pop("region", None))
+        if region_name is None:
+            region_name = params.pop(
+                "watsonx_region_name", params.pop("watsonx_region", None)
+            )  # consistent with how vertex ai + aws regions are accepted
+        wx_credentials = params.pop(
+            "wx_credentials",
+            params.pop(
+                "watsonx_credentials", None
+            ),  # follow {provider}_credentials, same as vertex ai
+        )
+        api_version = params.pop("api_version", IBMWatsonXAI.api_version)
+        # Load auth variables from environment variables
+        if url is None:
+            url = (
+                get_secret("WATSONX_API_BASE")  # consistent with 'AZURE_API_BASE'
+                or get_secret("WATSONX_URL")
+                or get_secret("WX_URL")
+                or get_secret("WML_URL")
+            )
+        if api_key is None:
+            api_key = (
+                get_secret("WATSONX_APIKEY")
+                or get_secret("WATSONX_API_KEY")
+                or get_secret("WX_API_KEY")
+            )
+        if token is None:
+            token = get_secret("WATSONX_TOKEN") or get_secret("WX_TOKEN")
+        if project_id is None:
+            project_id = (
+                get_secret("WATSONX_PROJECT_ID")
+                or get_secret("WX_PROJECT_ID")
+                or get_secret("PROJECT_ID")
+            )
+        if region_name is None:
+            region_name = (
+                get_secret("WATSONX_REGION")
+                or get_secret("WX_REGION")
+                or get_secret("REGION")
+            )
+        if space_id is None:
+            space_id = (
+                get_secret("WATSONX_DEPLOYMENT_SPACE_ID")
+                or get_secret("WATSONX_SPACE_ID")
+                or get_secret("WX_SPACE_ID")
+                or get_secret("SPACE_ID")
+            )
+
+        # credentials parsing
+        if wx_credentials is not None:
+            url = wx_credentials.get("url", url)
+            api_key = wx_credentials.get(
+                "apikey", wx_credentials.get("api_key", api_key)
+            )
+            token = wx_credentials.get(
+                "token",
+                wx_credentials.get(
+                    "watsonx_token", token
+                ),  # follow format of {provider}_token, same as azure - e.g. 'azure_ad_token=..'
+            )
+
+        # verify that all required credentials are present
+        if url is None:
+            raise WatsonXAIError(
+                status_code=401,
+                message="Error: Watsonx URL not set. Set WX_URL in environment variables or pass in as a parameter.",
+            )
+        if token is None and api_key is not None:
+            # generate the auth token
+            if print_verbose:
+                print_verbose("Generating IAM token for Watsonx.ai")
+            token = self.generate_iam_token(api_key)
+        elif token is None and api_key is None:
+            raise WatsonXAIError(
+                status_code=401,
+                url=url,
+                message="Error: API key or token not found. Set WX_API_KEY or WX_TOKEN in environment variables or pass in as a parameter.",
+            )
+        if project_id is None:
+            raise WatsonXAIError(
+                status_code=401,
+                url=url,
+                message="Error: Watsonx project_id not set. Set WX_PROJECT_ID in environment variables or pass in as a parameter.",
+            )
+
+        return {
+            "url": url,
+            "api_key": api_key,
+            "token": token,
+            "project_id": project_id,
+            "space_id": space_id,
+            "region_name": region_name,
+            "api_version": api_version,
+        }
+
+    def completion(
+        self,
+        model: str,
+        messages: list,
+        custom_prompt_dict: dict,
+        model_response: ModelResponse,
+        print_verbose: Callable,
+        encoding,
+        logging_obj,
+        optional_params: dict,
+        litellm_params: Optional[dict] = None,
+        logger_fn=None,
+        timeout: Optional[float] = None,
+    ):
+        """
+        Send a text generation request to the IBM Watsonx.ai API.
+        Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
+        """
+        stream = optional_params.pop("stream", False)
+
+        # Load default configs
+        config = IBMWatsonXAIConfig.get_config()
+        for k, v in config.items():
+            if k not in optional_params:
+                optional_params[k] = v
+
+        # Make prompt to send to model
+        provider = model.split("/")[0]
+        # model_name = "/".join(model.split("/")[1:])
+        prompt = convert_messages_to_prompt(
+            model, messages, provider, custom_prompt_dict
+        )
+
+        def process_text_request(request_params: dict) -> ModelResponse:
+            with self._manage_response(
+                request_params, logging_obj=logging_obj, input=prompt, timeout=timeout
+            ) as resp:
+                json_resp = resp.json()
+
+            generated_text = json_resp["results"][0]["generated_text"]
+            prompt_tokens = json_resp["results"][0]["input_token_count"]
+            completion_tokens = json_resp["results"][0]["generated_token_count"]
+            model_response["choices"][0]["message"]["content"] = generated_text
+            model_response["finish_reason"] = json_resp["results"][0]["stop_reason"]
+            model_response["created"] = int(time.time())
+            model_response["model"] = model
+            setattr(
+                model_response,
+                "usage",
+                Usage(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=completion_tokens,
+                    total_tokens=prompt_tokens + completion_tokens,
+                ),
+            )
+            return model_response
+
+        def process_stream_request(
+            request_params: dict,
+        ) -> litellm.CustomStreamWrapper:
+            # stream the response - generated chunks will be handled
+            # by litellm.utils.CustomStreamWrapper.handle_watsonx_stream
+            with self._manage_response(
+                request_params,
+                logging_obj=logging_obj,
+                stream=True,
+                input=prompt,
+                timeout=timeout,
+            ) as resp:
+                response = litellm.CustomStreamWrapper(
+                    resp.iter_lines(),
+                    model=model,
+                    custom_llm_provider="watsonx",
+                    logging_obj=logging_obj,
+                )
+            return response
+
+        try:
+            ## Get the response from the model
+            req_params = self._prepare_text_generation_req(
+                model_id=model,
+                prompt=prompt,
+                stream=stream,
+                optional_params=optional_params,
+                print_verbose=print_verbose,
+            )
+            if stream:
+                return process_stream_request(req_params)
+            else:
+                return process_text_request(req_params)
+        except WatsonXAIError as e:
+            raise e
+        except Exception as e:
+            raise WatsonXAIError(status_code=500, message=str(e))
+
+    def embedding(
+        self,
+        model: str,
+        input: Union[list, str],
+        api_key: Optional[str] = None,
+        logging_obj=None,
+        model_response=None,
+        optional_params=None,
+        encoding=None,
+    ):
+        """
+        Send a text embedding request to the IBM Watsonx.ai API.
+        """
+        if optional_params is None:
+            optional_params = {}
+        # Load default configs
+        config = IBMWatsonXAIConfig.get_config()
+        for k, v in config.items():
+            if k not in optional_params:
+                optional_params[k] = v
+
+        # Load auth variables from environment variables
+        if isinstance(input, str):
+            input = [input]
+        if api_key is not None:
+            optional_params["api_key"] = api_key
+        api_params = self._get_api_params(optional_params)
+        # build auth headers
+        api_token = api_params.get("token")
+        headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+        # init the payload to the text generation call
+        payload = {
+            "inputs": input,
+            "model_id": model,
+            "project_id": api_params["project_id"],
+            "parameters": optional_params,
+        }
+        request_params = dict(version=api_params["api_version"])
+        url = api_params["url"].rstrip("/") + WatsonXAIEndpoint.EMBEDDINGS
+        # request = httpx.Request(
+        #     "POST", url, headers=headers, json=payload, params=request_params
+        # )
+        req_params = {
+            "method": "POST",
+            "url": url,
+            "headers": headers,
+            "json": payload,
+            "params": request_params,
+        }
+        with self._manage_response(
+            req_params, logging_obj=logging_obj, input=input
+        ) as resp:
+            json_resp = resp.json()
+
+        results = json_resp.get("results", [])
+        embedding_response = []
+        for idx, result in enumerate(results):
+            embedding_response.append(
+                {"object": "embedding", "index": idx, "embedding": result["embedding"]}
+            )
+        model_response["object"] = "list"
+        model_response["data"] = embedding_response
+        model_response["model"] = model
+        input_tokens = json_resp.get("input_token_count", 0)
+        model_response.usage = Usage(
+            prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
+        )
+        return model_response
+
+    def generate_iam_token(self, api_key=None, **params):
+        headers = {}
+        headers["Content-Type"] = "application/x-www-form-urlencoded"
+        if api_key is None:
+            api_key = get_secret("WX_API_KEY") or get_secret("WATSONX_API_KEY")
+        if api_key is None:
+            raise ValueError("API key is required")
+        headers["Accept"] = "application/json"
+        data = {
+            "grant_type": "urn:ibm:params:oauth:grant-type:apikey",
+            "apikey": api_key,
+        }
+        response = httpx.post(
+            "https://iam.cloud.ibm.com/identity/token", data=data, headers=headers
+        )
+        response.raise_for_status()
+        json_data = response.json()
+        iam_access_token = json_data["access_token"]
+        self.token = iam_access_token
+        return iam_access_token
+
+    @contextmanager
+    def _manage_response(
+        self,
+        request_params: dict,
+        logging_obj: Any,
+        stream: bool = False,
+        input: Optional[Any] = None,
+        timeout: Optional[float] = None,
+    ):
+        request_str = (
+            f"response = {request_params['method']}(\n"
+            f"\turl={request_params['url']},\n"
+            f"\tjson={request_params['json']},\n"
+            f")"
+        )
+        logging_obj.pre_call(
+            input=input,
+            api_key=request_params["headers"].get("Authorization"),
+            additional_args={
+                "complete_input_dict": request_params["json"],
+                "request_str": request_str,
+            },
+        )
+        if timeout:
+            request_params["timeout"] = timeout
+        try:
+            if stream:
+                resp = requests.request(
+                    **request_params,
+                    stream=True,
+                )
+                resp.raise_for_status()
+                yield resp
+            else:
+                resp = requests.request(**request_params)
+                resp.raise_for_status()
+                yield resp
+        except Exception as e:
+            raise WatsonXAIError(status_code=500, message=str(e))
+        if not stream:
+            logging_obj.post_call(
+                input=input,
+                api_key=request_params["headers"].get("Authorization"),
+                original_response=json.dumps(resp.json()),
+                additional_args={
+                    "status_code": resp.status_code,
+                    "complete_input_dict": request_params["json"],
+                },
+            )
--- a/litellm/main.py
+++ b/litellm/main.py
@ -63,6 +63,7 @@ from .llms import (
    vertex_ai,
    vertex_ai_anthropic,
    maritalk,
+    watsonx,
 )
 from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
 from .llms.azure import AzureChatCompletion
@ -360,7 +361,7 @@ def mock_completion(
    model: str,
    messages: List,
    stream: Optional[bool] = False,
-    mock_response: str = "This is a mock request",
+    mock_response: Union[str, Exception] = "This is a mock request",
    logging=None,
    **kwargs,
 ):
@ -387,6 +388,20 @@ def mock_completion(
        - If 'stream' is True, it returns a response that mimics the behavior of a streaming completion.
    """
    try:
+        ## LOGGING
+        if logging is not None:
+            logging.pre_call(
+                input=messages,
+                api_key="mock-key",
+            )
+        if isinstance(mock_response, Exception):
+            raise litellm.APIError(
+                status_code=500,  # type: ignore
+                message=str(mock_response),
+                llm_provider="openai",  # type: ignore
+                model=model,  # type: ignore
+                request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
+            )
        model_response = ModelResponse(stream=stream)
        if stream is True:
            # don't try to access stream object,
@ -1864,6 +1879,43 @@ def completion(

            ## RESPONSE OBJECT
            response = response
+        elif custom_llm_provider == "watsonx":
+            custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
+            response = watsonx.IBMWatsonXAI().completion(
+                model=model,
+                messages=messages,
+                custom_prompt_dict=custom_prompt_dict,
+                model_response=model_response,
+                print_verbose=print_verbose,
+                optional_params=optional_params,
+                litellm_params=litellm_params,  # type: ignore
+                logger_fn=logger_fn,
+                encoding=encoding,
+                logging_obj=logging,
+                timeout=timeout,
+            )
+            if (
+                "stream" in optional_params
+                and optional_params["stream"] == True
+                and not isinstance(response, CustomStreamWrapper)
+            ):
+                # don't try to access stream object,
+                response = CustomStreamWrapper(
+                    iter(response),
+                    model,
+                    custom_llm_provider="watsonx",
+                    logging_obj=logging,
+                )
+
+            if optional_params.get("stream", False):
+                ## LOGGING
+                logging.post_call(
+                    input=messages,
+                    api_key=None,
+                    original_response=response,
+                )
+            ## RESPONSE OBJECT
+            response = response
        elif custom_llm_provider == "vllm":
            custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
            model_response = vllm.completion(
@ -2943,6 +2995,15 @@ def embedding(
                client=client,
                aembedding=aembedding,
            )
+        elif custom_llm_provider == "watsonx":
+            response = watsonx.IBMWatsonXAI().embedding(
+                model=model,
+                input=input,
+                encoding=encoding,
+                logging_obj=logging,
+                optional_params=optional_params,
+                model_response=EmbeddingResponse(),
+            )
        else:
            args = locals()
            raise ValueError(f"No valid embedding model args passed in - {args}")
--- a/litellm/model_prices_and_context_window_backup.json
+++ b/litellm/model_prices_and_context_window_backup.json
@ -1418,6 +1418,123 @@
        "litellm_provider": "replicate",
        "mode": "chat"
    },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
    "openrouter/openai/gpt-3.5-turbo": {
        "max_tokens": 4095,
        "input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
        "litellm_provider": "openrouter",
        "mode": "chat"
    },
+    "openrouter/anthropic/claude-3-opus": {
+        "max_tokens": 4096,
+        "max_input_tokens": 200000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.000015,
+        "output_cost_per_token": 0.000075,
+        "litellm_provider": "openrouter",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "tool_use_system_prompt_tokens": 395
+    },
    "openrouter/google/palm-2-chat-bison": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
+    "meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192, 
+        "max_input_tokens": 8192, 
+        "max_output_tokens": 8192, 
+        "input_cost_per_token": 0.0000004,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192, 
+        "max_input_tokens": 8192, 
+        "max_output_tokens": 8192, 
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
    "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
        "max_tokens": 77, 
        "max_input_tokens": 77, 
--- a/litellm/proxy/_experimental/out/404.html
+++ b/litellm/proxy/_experimental/out/404.html
--- a/litellm/proxy/_experimental/out/_next/static/c5rha8cqAah-saaczjn02/_buildManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/c5rha8cqAah-saaczjn02/_buildManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/c5rha8cqAah-saaczjn02/_ssgManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/c5rha8cqAah-saaczjn02/_ssgManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/127-efd0436630e294eb.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/127-efd0436630e294eb.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/386-d811195b597a2122.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/386-d811195b597a2122.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-bdfb585eb82bdab5.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-bdfb585eb82bdab5.js
@ -0,0 +1 @@
+(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-ecc8c750567f72a0.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/layout-ecc8c750567f72a0.js
@ -1 +0,0 @@
-(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-525d83925fd5350b.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-525d83925fd5350b.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-5a4a198eefedc775.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-5a4a198eefedc775.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/app/page-e0ee34389254cdf2.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/app/page-e0ee34389254cdf2.js
--- a/litellm/proxy/_experimental/out/_next/static/chunks/main-app-096338c8e1915716.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/main-app-096338c8e1915716.js
@ -1 +1 @@
-(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);
+(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{70377:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(70377)}),_N_E=e.O()}]);
--- a/litellm/proxy/_experimental/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/main-app-9b4fb13a7db53edf.js
@ -1 +0,0 @@
-(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);
--- a/litellm/proxy/_experimental/out/_next/static/chunks/webpack-65a932b4e8bd8abb.js
+++ b/litellm/proxy/_experimental/out/_next/static/chunks/webpack-65a932b4e8bd8abb.js
@ -1 +1 @@
-!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/60d9f441227ccc7e.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
+!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/9f51f0573c6b0365.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
--- a/litellm/proxy/_experimental/out/_next/static/css/60d9f441227ccc7e.css
+++ b/litellm/proxy/_experimental/out/_next/static/css/60d9f441227ccc7e.css
--- a/litellm/proxy/_experimental/out/_next/static/css/9f51f0573c6b0365.css
+++ b/litellm/proxy/_experimental/out/_next/static/css/9f51f0573c6b0365.css
--- a/litellm/proxy/_experimental/out/_next/static/dWGL92c5LzTMn7XX6utn2/_buildManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/dWGL92c5LzTMn7XX6utn2/_buildManifest.js
--- a/litellm/proxy/_experimental/out/_next/static/dWGL92c5LzTMn7XX6utn2/_ssgManifest.js
+++ b/litellm/proxy/_experimental/out/_next/static/dWGL92c5LzTMn7XX6utn2/_ssgManifest.js
--- a/litellm/proxy/_experimental/out/index.html
+++ b/litellm/proxy/_experimental/out/index.html
@ -1 +1,5 @@
-<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[82332,[\"127\",\"static/chunks/127-efd0436630e294eb.js\",\"931\",\"static/chunks/app/page-525d83925fd5350b.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"Csz8BqWx6JEoKsgLqCeCt\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
+<<<<<<< HEAD
+<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"761\",\"static/chunks/761-05f8a8451296476c.js\",\"931\",\"static/chunks/app/page-5a4a198eefedc775.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"c5rha8cqAah-saaczjn02\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
+=======
+<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"386\",\"static/chunks/386-d811195b597a2122.js\",\"931\",\"static/chunks/app/page-e0ee34389254cdf2.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"dWGL92c5LzTMn7XX6utn2\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
+>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)
--- a/litellm/proxy/_experimental/out/index.txt
+++ b/litellm/proxy/_experimental/out/index.txt
@ -1,7 +1,14 @@
 2:I[77831,[],""]
-3:I[82332,["127","static/chunks/127-efd0436630e294eb.js","931","static/chunks/app/page-525d83925fd5350b.js"],""]
+<<<<<<< HEAD
+3:I[46414,["761","static/chunks/761-05f8a8451296476c.js","931","static/chunks/app/page-5a4a198eefedc775.js"],""]
 4:I[5613,[],""]
 5:I[31778,[],""]
-0:["Csz8BqWx6JEoKsgLqCeCt",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/60d9f441227ccc7e.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+0:["c5rha8cqAah-saaczjn02",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00c2ddbcd01819c0.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+=======
+3:I[46414,["386","static/chunks/386-d811195b597a2122.js","931","static/chunks/app/page-e0ee34389254cdf2.js"],""]
+4:I[5613,[],""]
+5:I[31778,[],""]
+0:["dWGL92c5LzTMn7XX6utn2",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/9f51f0573c6b0365.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
+>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)
 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
 1:null
--- a/litellm/proxy/_super_secret_config.yaml
+++ b/litellm/proxy/_super_secret_config.yaml
@ -1,51 +1,15 @@
-environment_variables:
-  SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
-general_settings:
-  alerting:
-  - slack
-  alerting_threshold: 300
-  database_connection_pool_limit: 100
-  database_connection_timeout: 60
-  health_check_interval: 300
-  proxy_batch_write_at: 10
-  ui_access_mode: all
-litellm_settings:
-  allowed_fails: 3
-  failure_callback:
-  - prometheus
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  num_retries: 3
-  service_callback:
-  - prometheus_system
-  success_callback:
-  - prometheus
 model_list:
 - litellm_params:
    api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
    api_key: my-fake-key
    model: openai/my-fake-model
  model_name: fake-openai-endpoint
- litellm_params:
-    model: gpt-3.5-turbo
-  model_name: gpt-3.5-turbo
- model_name: llama-3
-  litellm_params:
-    model: replicate/meta/meta-llama-3-8b-instruct
 router_settings:
-  allowed_fails: 3
-  context_window_fallbacks: null
-  cooldown_time: 1
-  fallbacks:
-  - gpt-3.5-turbo:
-    - fake-openai-endpoint
-    - gpt-4
-  - gpt-3.5-turbo-3:
-    - fake-openai-endpoint
-  num_retries: 3
-  retry_after: 0
-  routing_strategy: simple-shuffle
-  routing_strategy_args: {}
-  timeout: 6000
+  num_retries: 0
+  enable_pre_call_checks: true
+  redis_host: os.environ/REDIS_HOST
+  redis_password: os.environ/REDIS_PASSWORD
+  redis_port: os.environ/REDIS_PORT
+
+litellm_settings:
+  success_callback: ["openmeter"]
--- a/litellm/proxy/_types.py
+++ b/litellm/proxy/_types.py
@ -422,6 +422,9 @@ class LiteLLM_ModelTable(LiteLLMBase):
    created_by: str
    updated_by: str

+    class Config:
+        protected_namespaces = ()
+

 class NewUserRequest(GenerateKeyRequest):
    max_budget: Optional[float] = None
@ -485,6 +488,9 @@ class TeamBase(LiteLLMBase):
 class NewTeamRequest(TeamBase):
    model_aliases: Optional[dict] = None

+    class Config:
+        protected_namespaces = ()
+

 class GlobalEndUsersSpend(LiteLLMBase):
    api_key: Optional[str] = None
@ -534,6 +540,9 @@ class LiteLLM_TeamTable(TeamBase):
    budget_reset_at: Optional[datetime] = None
    model_id: Optional[int] = None

+    class Config:
+        protected_namespaces = ()
+
    @root_validator(pre=True)
    def set_model_info(cls, values):
        dict_fields = [
@ -570,6 +579,9 @@ class LiteLLM_BudgetTable(LiteLLMBase):
    model_max_budget: Optional[dict] = None
    budget_duration: Optional[str] = None

+    class Config:
+        protected_namespaces = ()
+

 class NewOrganizationRequest(LiteLLM_BudgetTable):
    organization_id: Optional[str] = None
@ -720,6 +732,10 @@ class ConfigGeneralSettings(LiteLLMBase):
        None,
        description="List of alerting types. By default it is all alerts",
    )
+    alert_to_webhook_url: Optional[Dict] = Field(
+        None,
+        description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
+    )

    alerting_threshold: Optional[int] = Field(
        None,
@ -896,5 +912,19 @@ class LiteLLM_SpendLogs(LiteLLMBase):
    request_tags: Optional[Json] = None


+class LiteLLM_ErrorLogs(LiteLLMBase):
+    request_id: Optional[str] = str(uuid.uuid4())
+    api_base: Optional[str] = ""
+    model_group: Optional[str] = ""
+    litellm_model_name: Optional[str] = ""
+    model_id: Optional[str] = ""
+    request_kwargs: Optional[dict] = {}
+    exception_type: Optional[str] = ""
+    status_code: Optional[str] = ""
+    exception_string: Optional[str] = ""
+    startTime: Union[str, datetime, None]
+    endTime: Union[str, datetime, None]
+
+
 class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase):
    response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None
--- a/litellm/proxy/auth/auth_checks.py
+++ b/litellm/proxy/auth/auth_checks.py
@ -95,7 +95,15 @@ def common_checks(
                f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
            )
    # 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
-    if litellm.max_budget > 0 and global_proxy_spend is not None:
+    if (
+        litellm.max_budget > 0
+        and global_proxy_spend is not None
+        # only run global budget checks for OpenAI routes
+        # Reason - the Admin UI should continue working if the proxy crosses it's global budget
+        and route in LiteLLMRoutes.openai_routes.value
+        and route != "/v1/models"
+        and route != "/models"
+    ):
        if global_proxy_spend > litellm.max_budget:
            raise Exception(
                f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@ -1059,8 +1059,18 @@ async def user_api_key_auth(
                ):
                    pass
                else:
+                    user_role = "unknown"
+                    user_id = "unknown"
+                    if user_id_information is not None and isinstance(
+                        user_id_information, list
+                    ):
+                        _user = user_id_information[0]
+                        user_role = _user.get("user_role", {}).get(
+                            "user_role", "unknown"
+                        )
+                        user_id = _user.get("user_id", "unknown")
                    raise Exception(
-                        f"Only master key can be used to generate, delete, update info for new keys/users/teams. Route={route}"
+                        f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
                    )

        # check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
@ -1207,6 +1217,68 @@ def cost_tracking():
                litellm.success_callback.append(_PROXY_track_cost_callback)  # type: ignore


+async def _PROXY_failure_handler(
+    kwargs,  # kwargs to completion
+    completion_response: litellm.ModelResponse,  # response from completion
+    start_time=None,
+    end_time=None,  # start/end time for completion
+):
+    global prisma_client
+    if prisma_client is not None:
+        verbose_proxy_logger.debug(
+            "inside _PROXY_failure_handler kwargs=", extra=kwargs
+        )
+
+        _exception = kwargs.get("exception")
+        _exception_type = _exception.__class__.__name__
+        _model = kwargs.get("model", None)
+
+        _optional_params = kwargs.get("optional_params", {})
+        _optional_params = copy.deepcopy(_optional_params)
+
+        for k, v in _optional_params.items():
+            v = str(v)
+            v = v[:100]
+
+        _status_code = "500"
+        try:
+            _status_code = str(_exception.status_code)
+        except:
+            # Don't let this fail logging the exception to the dB
+            pass
+
+        _litellm_params = kwargs.get("litellm_params", {}) or {}
+        _metadata = _litellm_params.get("metadata", {}) or {}
+        _model_id = _metadata.get("model_info", {}).get("id", "")
+        _model_group = _metadata.get("model_group", "")
+        api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
+        _exception_string = str(_exception)[:500]
+
+        error_log = LiteLLM_ErrorLogs(
+            request_id=str(uuid.uuid4()),
+            model_group=_model_group,
+            model_id=_model_id,
+            litellm_model_name=kwargs.get("model"),
+            request_kwargs=_optional_params,
+            api_base=api_base,
+            exception_type=_exception_type,
+            status_code=_status_code,
+            exception_string=_exception_string,
+            startTime=kwargs.get("start_time"),
+            endTime=kwargs.get("end_time"),
+        )
+
+        # helper function to convert to dict on pydantic v2 & v1
+        error_log_dict = _get_pydantic_json_dict(error_log)
+        error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])
+
+        await prisma_client.db.litellm_errorlogs.create(
+            data=error_log_dict  # type: ignore
+        )
+
+    pass
+
+
 async def _PROXY_track_cost_callback(
    kwargs,  # kwargs to completion
    completion_response: litellm.ModelResponse,  # response from completion
@ -1292,6 +1364,15 @@ async def _PROXY_track_cost_callback(
        verbose_proxy_logger.debug("error in tracking cost callback - %s", e)


+def error_tracking():
+    global prisma_client, custom_db_client
+    if prisma_client is not None or custom_db_client is not None:
+        if isinstance(litellm.failure_callback, list):
+            verbose_proxy_logger.debug("setting litellm failure callback to track cost")
+            if (_PROXY_failure_handler) not in litellm.failure_callback:  # type: ignore
+                litellm.failure_callback.append(_PROXY_failure_handler)  # type: ignore
+
+
 def _set_spend_logs_payload(
    payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None
 ):
@ -2612,6 +2693,7 @@ class ProxyConfig:
        environment_variables = config_data.get("environment_variables", {})
        for k, v in environment_variables.items():
            try:
+                if v is not None:
                    decoded_b64 = base64.b64decode(v)
                    value = decrypt_value(value=decoded_b64, master_key=master_key)  # type: ignore
                    os.environ[k] = value
@ -2632,9 +2714,17 @@ class ProxyConfig:
        if "alert_types" in _general_settings:
            general_settings["alert_types"] = _general_settings["alert_types"]
            proxy_logging_obj.alert_types = general_settings["alert_types"]
-            proxy_logging_obj.slack_alerting_instance.alert_types = general_settings[
-                "alert_types"
+            proxy_logging_obj.slack_alerting_instance.update_values(
+                alert_types=general_settings["alert_types"]
+            )
+
+        if "alert_to_webhook_url" in _general_settings:
+            general_settings["alert_to_webhook_url"] = _general_settings[
+                "alert_to_webhook_url"
            ]
+            proxy_logging_obj.slack_alerting_instance.update_values(
+                alert_to_webhook_url=general_settings["alert_to_webhook_url"]
+            )

        # router settings
        if llm_router is not None and prisma_client is not None:
@ -3176,6 +3266,9 @@ async def startup_event():
    ## COST TRACKING ##
    cost_tracking()

+    ## Error Tracking ##
+    error_tracking()
+
    db_writer_client = HTTPHandler()

    proxy_logging_obj._init_litellm_callbacks()  # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
@ -3655,6 +3748,17 @@ async def chat_completion(
        if data["model"] in litellm.model_alias_map:
            data["model"] = litellm.model_alias_map[data["model"]]

+        ## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
+        data["litellm_call_id"] = str(uuid.uuid4())
+        logging_obj, data = litellm.utils.function_setup(
+            original_function="acompletion",
+            rules_obj=litellm.utils.Rules(),
+            start_time=datetime.now(),
+            **data,
+        )
+
+        data["litellm_logging_obj"] = logging_obj
+
        ### CALL HOOKS ### - modify incoming data before calling the model
        data = await proxy_logging_obj.pre_call_hook(
            user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
@ -7421,9 +7525,9 @@ async def model_info_v2(
 )
 async def model_metrics(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
-    _selected_model_group: Optional[str] = None,
-    startTime: Optional[datetime] = datetime.now() - timedelta(days=30),
-    endTime: Optional[datetime] = datetime.now(),
+    _selected_model_group: Optional[str] = "gpt-4-32k",
+    startTime: Optional[datetime] = None,
+    endTime: Optional[datetime] = None,
 ):
    global prisma_client, llm_router
    if prisma_client is None:
@ -7433,65 +7537,214 @@ async def model_metrics(
            param="None",
            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )
-    if _selected_model_group and llm_router is not None:
-        _model_list = llm_router.get_model_list()
-        _relevant_api_bases = []
-        for model in _model_list:
-            if model["model_name"] == _selected_model_group:
-                _litellm_params = model["litellm_params"]
-                _api_base = _litellm_params.get("api_base", "")
-                _relevant_api_bases.append(_api_base)
-                _relevant_api_bases.append(_api_base + "/openai/")
+    startTime = startTime or datetime.now() - timedelta(days=30)
+    endTime = endTime or datetime.now()

    sql_query = """
        SELECT
-                CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
-                COUNT(*) AS num_requests,
-                AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
-            FROM "LiteLLM_SpendLogs"
-            WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
-            AND api_base = ANY($3)
-            GROUP BY CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
-            ORDER BY num_requests DESC
-            LIMIT 50;
+            api_base,
+            model,
+            DATE_TRUNC('day', "startTime")::DATE AS day,
+            AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) / SUM(total_tokens) AS avg_latency_per_token
+        FROM
+            "LiteLLM_SpendLogs"
+        WHERE
+            "startTime" >= NOW() - INTERVAL '30 days'
+            AND "model" = $1 AND "cache_hit" != 'True'
+        GROUP BY
+            api_base,
+            model,
+            day
+        HAVING
+            SUM(total_tokens) > 0
+        ORDER BY
+            avg_latency_per_token DESC;
+    """
+    _all_api_bases = set()
+    db_response = await prisma_client.db.query_raw(
+        sql_query, _selected_model_group, startTime, endTime
+    )
+    _daily_entries: dict = {}  # {"Jun 23": {"model1": 0.002, "model2": 0.003}}
+    if db_response is not None:
+        for model_data in db_response:
+            _api_base = model_data["api_base"]
+            _model = model_data["model"]
+            _day = model_data["day"]
+            _avg_latency_per_token = model_data["avg_latency_per_token"]
+            if _day not in _daily_entries:
+                _daily_entries[_day] = {}
+            _combined_model_name = str(_model)
+            if "https://" in _api_base:
+                _combined_model_name = str(_api_base)
+            if "/openai/" in _combined_model_name:
+                _combined_model_name = _combined_model_name.split("/openai/")[0]
+
+            _all_api_bases.add(_combined_model_name)
+            _daily_entries[_day][_combined_model_name] = _avg_latency_per_token
+
+        """
+        each entry needs to be like this:
+        {
+            date: 'Jun 23',
+            'gpt-4-https://api.openai.com/v1/': 0.002,
+            'gpt-43-https://api.openai.com-12/v1/': 0.002,
+        }
+        """
+        # convert daily entries to list of dicts
+
+        response: List[dict] = []
+
+        # sort daily entries by date
+        _daily_entries = dict(sorted(_daily_entries.items(), key=lambda item: item[0]))
+        for day in _daily_entries:
+            entry = {"date": str(day)}
+            for model_key, latency in _daily_entries[day].items():
+                entry[model_key] = latency
+            response.append(entry)
+
+        return {
+            "data": response,
+            "all_api_bases": list(_all_api_bases),
+        }
+
+
+@router.get(
+    "/model/metrics/slow_responses",
+    description="View number of hanging requests per model_group",
+    tags=["model management"],
+    include_in_schema=False,
+    dependencies=[Depends(user_api_key_auth)],
+)
+async def model_metrics_slow_responses(
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+    _selected_model_group: Optional[str] = "gpt-4-32k",
+    startTime: Optional[datetime] = None,
+    endTime: Optional[datetime] = None,
+):
+    global prisma_client, llm_router, proxy_logging_obj
+    if prisma_client is None:
+        raise ProxyException(
+            message="Prisma Client is not initialized",
+            type="internal_error",
+            param="None",
+            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        )
+    startTime = startTime or datetime.now() - timedelta(days=30)
+    endTime = endTime or datetime.now()
+
+    alerting_threshold = (
+        proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
+    )
+    alerting_threshold = int(alerting_threshold)
+
+    sql_query = """
+SELECT
+    api_base,
+    COUNT(*) AS total_count,
+    SUM(CASE
+        WHEN ("endTime" - "startTime") >= (INTERVAL '1 SECOND' * CAST($1 AS INTEGER)) THEN 1
+        ELSE 0
+    END) AS slow_count
+FROM
+    "LiteLLM_SpendLogs"
+WHERE
+    "model" = $2
+    AND "cache_hit" != 'True'
+GROUP BY
+    api_base
+ORDER BY
+    slow_count DESC;
    """

    db_response = await prisma_client.db.query_raw(
-            sql_query, startTime, endTime, _relevant_api_bases
+        sql_query, alerting_threshold, _selected_model_group
    )
-    else:

-        sql_query = """
-            SELECT
-                CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
-                COUNT(*) AS num_requests,
-                AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
-            FROM
-                "LiteLLM_SpendLogs"
-            WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
-            GROUP BY
-                CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
-            ORDER BY
-                num_requests DESC
-            LIMIT 50;
+    if db_response is not None:
+        for row in db_response:
+            _api_base = row.get("api_base") or ""
+            if "/openai/" in _api_base:
+                _api_base = _api_base.split("/openai/")[0]
+            row["api_base"] = _api_base
+    return db_response
+
+
+@router.get(
+    "/model/metrics/exceptions",
+    description="View number of failed requests per model on config.yaml",
+    tags=["model management"],
+    include_in_schema=False,
+    dependencies=[Depends(user_api_key_auth)],
+)
+async def model_metrics_exceptions(
+    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
+    _selected_model_group: Optional[str] = None,
+    startTime: Optional[datetime] = None,
+    endTime: Optional[datetime] = None,
+):
+    global prisma_client, llm_router
+    if prisma_client is None:
+        raise ProxyException(
+            message="Prisma Client is not initialized",
+            type="internal_error",
+            param="None",
+            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        )
+
+    startTime = startTime or datetime.now() - timedelta(days=30)
+    endTime = endTime or datetime.now()
+
+    """
+    """
+    sql_query = """
+        WITH cte AS (
+            SELECT 
+                CASE WHEN api_base = '' THEN litellm_model_name ELSE CONCAT(litellm_model_name, '-', api_base) END AS combined_model_api_base,
+                exception_type,
+                COUNT(*) AS num_exceptions
+            FROM "LiteLLM_ErrorLogs"
+            WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
+            GROUP BY combined_model_api_base, exception_type
+        )
+        SELECT 
+            combined_model_api_base,
+            COUNT(*) AS total_exceptions,
+            json_object_agg(exception_type, num_exceptions) AS exception_counts
+        FROM cte
+        GROUP BY combined_model_api_base
+        ORDER BY total_exceptions DESC
+        LIMIT 200;
    """
-
    db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime)
    response: List[dict] = []
-    if response is not None:
+    exception_types = set()
+
+    """
+    Return Data
+    {
+        "combined_model_api_base": "gpt-3.5-turbo-https://api.openai.com/v1/,
+        "total_exceptions": 5,
+        "BadRequestException": 5,
+        "TimeoutException": 2
+    }
+    """
+
+    if db_response is not None:
        # loop through all models
        for model_data in db_response:
            model = model_data.get("combined_model_api_base", "")
-            num_requests = model_data.get("num_requests", 0)
-            avg_latency_seconds = model_data.get("avg_latency_seconds", 0)
-            response.append(
-                {
+            total_exceptions = model_data.get("total_exceptions", 0)
+            exception_counts = model_data.get("exception_counts", {})
+            curr_row = {
                "model": model,
-                    "num_requests": num_requests,
-                    "avg_latency_seconds": avg_latency_seconds,
+                "total_exceptions": total_exceptions,
            }
-            )
-    return response
+            curr_row.update(exception_counts)
+            response.append(curr_row)
+            for k, v in exception_counts.items():
+                exception_types.add(k)
+
+    return {"data": response, "exception_types": list(exception_types)}


@router.get(
@ -8453,6 +8706,13 @@ async def update_config(config_info: ConfigYAML):
            _existing_settings = config["general_settings"]
            for k, v in updated_general_settings.items():
                # overwrite existing settings with updated values
+                if k == "alert_to_webhook_url":
+                    # check if slack is already enabled. if not, enable it
+                    if "slack" not in _existing_settings:
+                        if "alerting" not in _existing_settings:
+                            _existing_settings["alerting"] = ["slack"]
+                        elif isinstance(_existing_settings["alerting"], list):
+                            _existing_settings["alerting"].append("slack")
                _existing_settings[k] = v
            config["general_settings"] = _existing_settings

@ -8567,7 +8827,25 @@ async def get_config():
        
        """
        for _callback in _success_callbacks:
-            if _callback == "langfuse":
+            if _callback == "openmeter":
+                env_vars = [
+                    "OPENMETER_API_KEY",
+                ]
+                env_vars_dict = {}
+                for _var in env_vars:
+                    env_variable = environment_variables.get(_var, None)
+                    if env_variable is None:
+                        env_vars_dict[_var] = None
+                    else:
+                        # decode + decrypt the value
+                        decoded_b64 = base64.b64decode(env_variable)
+                        _decrypted_value = decrypt_value(
+                            value=decoded_b64, master_key=master_key
+                        )
+                        env_vars_dict[_var] = _decrypted_value
+
+                _data_to_return.append({"name": _callback, "variables": env_vars_dict})
+            elif _callback == "langfuse":
                _langfuse_vars = [
                    "LANGFUSE_PUBLIC_KEY",
                    "LANGFUSE_SECRET_KEY",
@ -8592,6 +8870,7 @@ async def get_config():

        # Check if slack alerting is on
        _alerting = _general_settings.get("alerting", [])
+        alerting_data = []
        if "slack" in _alerting:
            _slack_vars = [
                "SLACK_WEBHOOK_URL",
@ -8600,7 +8879,8 @@ async def get_config():
            for _var in _slack_vars:
                env_variable = environment_variables.get(_var, None)
                if env_variable is None:
-                    _slack_env_vars[_var] = None
+                    _value = os.getenv("SLACK_WEBHOOK_URL", None)
+                    _slack_env_vars[_var] = _value
                else:
                    # decode + decrypt the value
                    decoded_b64 = base64.b64decode(env_variable)
@ -8613,19 +8893,23 @@ async def get_config():
            _all_alert_types = (
                proxy_logging_obj.slack_alerting_instance._all_possible_alert_types()
            )
-            _data_to_return.append(
+            _alerts_to_webhook = (
+                proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
+            )
+            alerting_data.append(
                {
                    "name": "slack",
                    "variables": _slack_env_vars,
-                    "alerting_types": _alerting_types,
-                    "all_alert_types": _all_alert_types,
+                    "active_alerts": _alerting_types,
+                    "alerts_to_webhook": _alerts_to_webhook,
                }
            )

        _router_settings = llm_router.get_settings()
        return {
            "status": "success",
-            "data": _data_to_return,
+            "callbacks": _data_to_return,
+            "alerts": alerting_data,
            "router_settings": _router_settings,
        }
    except Exception as e:
@ -8701,9 +8985,9 @@ async def test_endpoint(request: Request):
 )
 async def health_services_endpoint(
    user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
-    service: Literal["slack_budget_alerts", "langfuse", "slack"] = fastapi.Query(
-        description="Specify the service being hit."
-    ),
+    service: Literal[
+        "slack_budget_alerts", "langfuse", "slack", "openmeter"
+    ] = fastapi.Query(description="Specify the service being hit."),
 ):
    """
    Hidden endpoint.
@ -8717,7 +9001,7 @@ async def health_services_endpoint(
            raise HTTPException(
                status_code=400, detail={"error": "Service must be specified."}
            )
-        if service not in ["slack_budget_alerts", "langfuse", "slack"]:
+        if service not in ["slack_budget_alerts", "langfuse", "slack", "openmeter"]:
            raise HTTPException(
                status_code=400,
                detail={
@ -8725,6 +9009,18 @@ async def health_services_endpoint(
                },
            )

+        if service == "openmeter":
+            _ = await litellm.acompletion(
+                model="openai/litellm-mock-response-model",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+                user="litellm:/health/services",
+                mock_response="This is a mock response",
+            )
+            return {
+                "status": "success",
+                "message": "Mock LLM request made - check openmeter.",
+            }
+
        if service == "langfuse":
            from litellm.integrations.langfuse import LangFuseLogger

@ -8741,9 +9037,53 @@ async def health_services_endpoint(
                "message": "Mock LLM request made - check langfuse.",
            }

+        if service == "slack" or service == "slack_budget_alerts":
            if "slack" in general_settings.get("alerting", []):
-            test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
-            await proxy_logging_obj.alerting_handler(message=test_message, level="Low")
+                # test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
+                # check if user has opted into unique_alert_webhooks
+                if (
+                    proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
+                    is not None
+                ):
+                    for (
+                        alert_type
+                    ) in proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url:
+                        """
+                        "llm_exceptions",
+                        "llm_too_slow",
+                        "llm_requests_hanging",
+                        "budget_alerts",
+                        "db_exceptions",
+                        """
+                        # only test alert if it's in active alert types
+                        if (
+                            proxy_logging_obj.slack_alerting_instance.alert_types
+                            is not None
+                            and alert_type
+                            not in proxy_logging_obj.slack_alerting_instance.alert_types
+                        ):
+                            continue
+                        test_message = "default test message"
+                        if alert_type == "llm_exceptions":
+                            test_message = f"LLM Exception test alert"
+                        elif alert_type == "llm_too_slow":
+                            test_message = f"LLM Too Slow test alert"
+                        elif alert_type == "llm_requests_hanging":
+                            test_message = f"LLM Requests Hanging test alert"
+                        elif alert_type == "budget_alerts":
+                            test_message = f"Budget Alert test alert"
+                        elif alert_type == "db_exceptions":
+                            test_message = f"DB Exception test alert"
+
+                        await proxy_logging_obj.alerting_handler(
+                            message=test_message, level="Low", alert_type=alert_type
+                        )
+                else:
+                    await proxy_logging_obj.alerting_handler(
+                        message="This is a test slack alert message",
+                        level="Low",
+                        alert_type="budget_alerts",
+                    )
                return {
                    "status": "success",
                    "message": "Mock Slack Alert sent, verify Slack Alert Received on your channel",
@ -8752,7 +9092,9 @@ async def health_services_endpoint(
                raise HTTPException(
                    status_code=422,
                    detail={
-                    "error": '"slack" not in proxy config: general_settings. Unable to test this.'
+                        "error": '"{}" not in proxy config: general_settings. Unable to test this.'.format(
+                            service
+                        )
                    },
                )
    except Exception as e:
@ -8761,7 +9103,7 @@ async def health_services_endpoint(
                message=getattr(e, "detail", f"Authentication Error({str(e)})"),
                type="auth_error",
                param=getattr(e, "param", "None"),
-                code=getattr(e, "status_code", status.HTTP_401_UNAUTHORIZED),
+                code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
            )
        elif isinstance(e, ProxyException):
            raise e
@ -8769,7 +9111,7 @@ async def health_services_endpoint(
            message="Authentication Error, " + str(e),
            type="auth_error",
            param=getattr(e, "param", "None"),
-            code=status.HTTP_401_UNAUTHORIZED,
+            code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        )


--- a/litellm/proxy/schema.prisma
+++ b/litellm/proxy/schema.prisma
@ -183,6 +183,21 @@ model LiteLLM_SpendLogs {
  end_user            String?
 }

+// View spend, model, api_key per request
+model LiteLLM_ErrorLogs {
+  request_id          String   @id @default(uuid())
+  startTime           DateTime // Assuming start_time is a DateTime field
+  endTime             DateTime // Assuming end_time is a DateTime field
+  api_base            String   @default("") 
+  model_group         String   @default("")      // public model_name / model_group
+  litellm_model_name  String   @default("")      // model passed to litellm
+  model_id            String   @default("")      // ID of model in ProxyModelTable
+  request_kwargs      Json     @default("{}")
+  exception_type      String   @default("")
+  exception_string    String   @default("")
+  status_code         String   @default("")
+}
+
 // Beta - allow team members to request access to a model
 model LiteLLM_UserNotifications {
  request_id          String @id
--- a/litellm/proxy/utils.py
+++ b/litellm/proxy/utils.py
@ -1,6 +1,6 @@
 from typing import Optional, List, Any, Literal, Union
 import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time
-import litellm, backoff
+import litellm, backoff, traceback
 from litellm.proxy._types import (
    UserAPIKeyAuth,
    DynamoDBArgs,
@ -199,6 +199,33 @@ class ProxyLogging:
            print_verbose(f"final data being sent to {call_type} call: {data}")
            return data
        except Exception as e:
+            if "litellm_logging_obj" in data:
+                logging_obj: litellm.utils.Logging = data["litellm_logging_obj"]
+
+                ## ASYNC FAILURE HANDLER ##
+                error_message = ""
+                if isinstance(e, HTTPException):
+                    if isinstance(e.detail, str):
+                        error_message = e.detail
+                    elif isinstance(e.detail, dict):
+                        error_message = json.dumps(e.detail)
+                    else:
+                        error_message = str(e)
+                else:
+                    error_message = str(e)
+                error_raised = Exception(f"{error_message}")
+                await logging_obj.async_failure_handler(
+                    exception=error_raised,
+                    traceback_exception=traceback.format_exc(),
+                )
+
+                ## SYNC FAILURE HANDLER ##
+                try:
+                    logging_obj.failure_handler(
+                        error_raised, traceback.format_exc()
+                    )  # DO NOT MAKE THREADED - router retry fallback relies on this!
+                except Exception as error_val:
+                    pass
            raise e

    async def during_call_hook(
@ -256,7 +283,16 @@ class ProxyLogging:
        )

    async def alerting_handler(
-        self, message: str, level: Literal["Low", "Medium", "High"]
+        self,
+        message: str,
+        level: Literal["Low", "Medium", "High"],
+        alert_type: Literal[
+            "llm_exceptions",
+            "llm_too_slow",
+            "llm_requests_hanging",
+            "budget_alerts",
+            "db_exceptions",
+        ],
    ):
        """
        Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -289,7 +325,7 @@ class ProxyLogging:
        for client in self.alerting:
            if client == "slack":
                await self.slack_alerting_instance.send_alert(
-                    message=message, level=level
+                    message=message, level=level, alert_type=alert_type
                )
            elif client == "sentry":
                if litellm.utils.sentry_sdk_instance is not None:
@ -323,6 +359,7 @@ class ProxyLogging:
            self.alerting_handler(
                message=f"DB read/write call failed: {error_message}",
                level="High",
+                alert_type="db_exceptions",
            )
        )

@ -354,7 +391,9 @@ class ProxyLogging:
            return
        asyncio.create_task(
            self.alerting_handler(
-                message=f"LLM API call failed: {str(original_exception)}", level="High"
+                message=f"LLM API call failed: {str(original_exception)}",
+                level="High",
+                alert_type="llm_exceptions",
            )
        )

@ -1738,7 +1777,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
    usage = response_obj["usage"]
    if type(usage) == litellm.Usage:
        usage = dict(usage)
-    id = response_obj.get("id", str(uuid.uuid4()))
+    id = response_obj.get("id", kwargs.get("litellm_call_id"))
    api_key = metadata.get("user_api_key", "")
    if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
        # hash the api_key
@ -2010,6 +2049,11 @@ async def update_spend(
                raise e

    ### UPDATE KEY TABLE ###
+    verbose_proxy_logger.debug(
+        "KEY Spend transactions: {}".format(
+            len(prisma_client.key_list_transactons.keys())
+        )
+    )
    if len(prisma_client.key_list_transactons.keys()) > 0:
        for i in range(n_retry_times + 1):
            start_time = time.time()
--- a/litellm/router.py
+++ b/litellm/router.py
@ -50,7 +50,6 @@ class Router:
    model_names: List = []
    cache_responses: Optional[bool] = False
    default_cache_time_seconds: int = 1 * 60 * 60  # 1 hour
-    num_retries: int = 0
    tenacity = None
    leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
    lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@ -70,9 +69,11 @@ class Router:
        ] = None,  # if you want to cache across model groups
        client_ttl: int = 3600,  # ttl for cached clients - will re-initialize after this time in seconds
        ## RELIABILITY ##
-        num_retries: int = 0,
+        num_retries: Optional[int] = None,
        timeout: Optional[float] = None,
-        default_litellm_params={},  # default params for Router.chat.completion.create
+        default_litellm_params: Optional[
+            dict
+        ] = None,  # default params for Router.chat.completion.create
        default_max_parallel_requests: Optional[int] = None,
        set_verbose: bool = False,
        debug_level: Literal["DEBUG", "INFO"] = "INFO",
@ -158,6 +159,7 @@ class Router:
        router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
        ```
        """
+
        if semaphore:
            self.semaphore = semaphore
        self.set_verbose = set_verbose
@ -229,7 +231,14 @@ class Router:
        self.failed_calls = (
            InMemoryCache()
        )  # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
-        self.num_retries = num_retries or litellm.num_retries or 0
+
+        if num_retries is not None:
+            self.num_retries = num_retries
+        elif litellm.num_retries is not None:
+            self.num_retries = litellm.num_retries
+        else:
+            self.num_retries = openai.DEFAULT_MAX_RETRIES
+
        self.timeout = timeout or litellm.request_timeout

        self.retry_after = retry_after
@ -255,6 +264,7 @@ class Router:
        )  # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group

        # make Router.chat.completions.create compatible for openai.chat.completions.create
+        default_litellm_params = default_litellm_params or {}
        self.chat = litellm.Chat(params=default_litellm_params, router_obj=self)

        # default litellm args
@ -280,6 +290,21 @@ class Router:
        }
        """
        ### ROUTING SETUP ###
+        self.routing_strategy_init(
+            routing_strategy=routing_strategy,
+            routing_strategy_args=routing_strategy_args,
+        )
+        ## COOLDOWNS ##
+        if isinstance(litellm.failure_callback, list):
+            litellm.failure_callback.append(self.deployment_callback_on_failure)
+        else:
+            litellm.failure_callback = [self.deployment_callback_on_failure]
+        print(  # noqa
+            f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
+        )  # noqa
+        self.routing_strategy_args = routing_strategy_args
+
+    def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
        if routing_strategy == "least-busy":
            self.leastbusy_logger = LeastBusyLoggingHandler(
                router_cache=self.cache, model_list=self.model_list
@ -311,15 +336,6 @@ class Router:
            )
            if isinstance(litellm.callbacks, list):
                litellm.callbacks.append(self.lowestlatency_logger)  # type: ignore
-        ## COOLDOWNS ##
-        if isinstance(litellm.failure_callback, list):
-            litellm.failure_callback.append(self.deployment_callback_on_failure)
-        else:
-            litellm.failure_callback = [self.deployment_callback_on_failure]
-        verbose_router_logger.info(
-            f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
-        )
-        self.routing_strategy_args = routing_strategy_args

    def print_deployment(self, deployment: dict):
        """
@ -428,6 +444,7 @@ class Router:
            kwargs["messages"] = messages
            kwargs["original_function"] = self._acompletion
            kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
+
            timeout = kwargs.get("request_timeout", self.timeout)
            kwargs.setdefault("metadata", {}).update({"model_group": model})

@ -469,6 +486,7 @@ class Router:
            )
            kwargs["model_info"] = deployment.get("model_info", {})
            data = deployment["litellm_params"].copy()
+
            model_name = data["model"]
            for k, v in self.default_litellm_params.items():
                if (
@ -1415,10 +1433,12 @@ class Router:
        context_window_fallbacks = kwargs.pop(
            "context_window_fallbacks", self.context_window_fallbacks
        )
-        verbose_router_logger.debug(
-            f"async function w/ retries: original_function - {original_function}"
-        )
+
        num_retries = kwargs.pop("num_retries")
+
+        verbose_router_logger.debug(
+            f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
+        )
        try:
            # if the function call is successful, no exception will be raised and we'll break out of the loop
            response = await original_function(*args, **kwargs)
@ -1436,37 +1456,47 @@ class Router:
                raise original_exception
            ### RETRY
            #### check if it should retry + back-off if required
-            if "No models available" in str(e):
-                timeout = litellm._calculate_retry_after(
-                    remaining_retries=num_retries,
-                    max_retries=num_retries,
-                    min_timeout=self.retry_after,
-                )
-                await asyncio.sleep(timeout)
-            elif RouterErrors.user_defined_ratelimit_error.value in str(e):
-                raise e  # don't wait to retry if deployment hits user-defined rate-limit
-            elif hasattr(original_exception, "status_code") and litellm._should_retry(
-                status_code=original_exception.status_code
-            ):
-                if hasattr(original_exception, "response") and hasattr(
-                    original_exception.response, "headers"
-                ):
-                    timeout = litellm._calculate_retry_after(
-                        remaining_retries=num_retries,
-                        max_retries=num_retries,
-                        response_headers=original_exception.response.headers,
-                        min_timeout=self.retry_after,
-                    )
-                else:
-                    timeout = litellm._calculate_retry_after(
-                        remaining_retries=num_retries,
-                        max_retries=num_retries,
-                        min_timeout=self.retry_after,
-                    )
-                await asyncio.sleep(timeout)
-            else:
-                raise original_exception
+            # if "No models available" in str(
+            #     e
+            # ) or RouterErrors.no_deployments_available.value in str(e):
+            #     timeout = litellm._calculate_retry_after(
+            #         remaining_retries=num_retries,
+            #         max_retries=num_retries,
+            #         min_timeout=self.retry_after,
+            #     )
+            #     await asyncio.sleep(timeout)
+            # elif RouterErrors.user_defined_ratelimit_error.value in str(e):
+            #     raise e  # don't wait to retry if deployment hits user-defined rate-limit

+            # elif hasattr(original_exception, "status_code") and litellm._should_retry(
+            #     status_code=original_exception.status_code
+            # ):
+            #     if hasattr(original_exception, "response") and hasattr(
+            #         original_exception.response, "headers"
+            #     ):
+            #         timeout = litellm._calculate_retry_after(
+            #             remaining_retries=num_retries,
+            #             max_retries=num_retries,
+            #             response_headers=original_exception.response.headers,
+            #             min_timeout=self.retry_after,
+            #         )
+            #     else:
+            #         timeout = litellm._calculate_retry_after(
+            #             remaining_retries=num_retries,
+            #             max_retries=num_retries,
+            #             min_timeout=self.retry_after,
+            #         )
+            #     await asyncio.sleep(timeout)
+            # else:
+            #     raise original_exception
+
+            ### RETRY
+            _timeout = self._router_should_retry(
+                e=original_exception,
+                remaining_retries=num_retries,
+                num_retries=num_retries,
+            )
+            await asyncio.sleep(_timeout)
            ## LOGGING
            if num_retries > 0:
                kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
@ -1488,34 +1518,12 @@ class Router:
                    ## LOGGING
                    kwargs = self.log_retry(kwargs=kwargs, e=e)
                    remaining_retries = num_retries - current_attempt
-                    if "No models available" in str(e):
-                        timeout = litellm._calculate_retry_after(
+                    _timeout = self._router_should_retry(
+                        e=original_exception,
                        remaining_retries=remaining_retries,
-                            max_retries=num_retries,
-                            min_timeout=self.retry_after,
+                        num_retries=num_retries,
                    )
-                        await asyncio.sleep(timeout)
-                    elif (
-                        hasattr(e, "status_code")
-                        and hasattr(e, "response")
-                        and litellm._should_retry(status_code=e.status_code)
-                    ):
-                        if hasattr(e.response, "headers"):
-                            timeout = litellm._calculate_retry_after(
-                                remaining_retries=remaining_retries,
-                                max_retries=num_retries,
-                                response_headers=e.response.headers,
-                                min_timeout=self.retry_after,
-                            )
-                        else:
-                            timeout = litellm._calculate_retry_after(
-                                remaining_retries=remaining_retries,
-                                max_retries=num_retries,
-                                min_timeout=self.retry_after,
-                            )
-                        await asyncio.sleep(timeout)
-                    else:
-                        raise e
+                    await asyncio.sleep(_timeout)
            raise original_exception

    def function_with_fallbacks(self, *args, **kwargs):
@ -1606,6 +1614,27 @@ class Router:
                raise e
            raise original_exception

+    def _router_should_retry(
+        self, e: Exception, remaining_retries: int, num_retries: int
+    ) -> Union[int, float]:
+        """
+        Calculate back-off, then retry
+        """
+        if hasattr(e, "response") and hasattr(e.response, "headers"):
+            timeout = litellm._calculate_retry_after(
+                remaining_retries=remaining_retries,
+                max_retries=num_retries,
+                response_headers=e.response.headers,
+                min_timeout=self.retry_after,
+            )
+        else:
+            timeout = litellm._calculate_retry_after(
+                remaining_retries=remaining_retries,
+                max_retries=num_retries,
+                min_timeout=self.retry_after,
+            )
+        return timeout
+
    def function_with_retries(self, *args, **kwargs):
        """
        Try calling the model 3 times. Shuffle between available deployments.
@ -1619,15 +1648,13 @@ class Router:
        context_window_fallbacks = kwargs.pop(
            "context_window_fallbacks", self.context_window_fallbacks
        )
+
        try:
            # if the function call is successful, no exception will be raised and we'll break out of the loop
            response = original_function(*args, **kwargs)
            return response
        except Exception as e:
            original_exception = e
-            verbose_router_logger.debug(
-                f"num retries in function with retries: {num_retries}"
-            )
            ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
            if (
                isinstance(original_exception, litellm.ContextWindowExceededError)
@ -1641,6 +1668,12 @@ class Router:
            if num_retries > 0:
                kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
            ### RETRY
+            _timeout = self._router_should_retry(
+                e=original_exception,
+                remaining_retries=num_retries,
+                num_retries=num_retries,
+            )
+            time.sleep(_timeout)
            for current_attempt in range(num_retries):
                verbose_router_logger.debug(
                    f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
@ -1654,34 +1687,12 @@ class Router:
                    ## LOGGING
                    kwargs = self.log_retry(kwargs=kwargs, e=e)
                    remaining_retries = num_retries - current_attempt
-                    if "No models available" in str(e):
-                        timeout = litellm._calculate_retry_after(
+                    _timeout = self._router_should_retry(
+                        e=e,
                        remaining_retries=remaining_retries,
-                            max_retries=num_retries,
-                            min_timeout=self.retry_after,
+                        num_retries=num_retries,
                    )
-                        time.sleep(timeout)
-                    elif (
-                        hasattr(e, "status_code")
-                        and hasattr(e, "response")
-                        and litellm._should_retry(status_code=e.status_code)
-                    ):
-                        if hasattr(e.response, "headers"):
-                            timeout = litellm._calculate_retry_after(
-                                remaining_retries=remaining_retries,
-                                max_retries=num_retries,
-                                response_headers=e.response.headers,
-                                min_timeout=self.retry_after,
-                            )
-                        else:
-                            timeout = litellm._calculate_retry_after(
-                                remaining_retries=remaining_retries,
-                                max_retries=num_retries,
-                                min_timeout=self.retry_after,
-                            )
-                        time.sleep(timeout)
-                    else:
-                        raise e
+                    time.sleep(_timeout)
            raise original_exception

    ### HELPER FUNCTIONS
@ -1715,10 +1726,11 @@ class Router:
            )  # i.e. azure
            metadata = kwargs.get("litellm_params", {}).get("metadata", None)
            _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
+
            if isinstance(_model_info, dict):
                deployment_id = _model_info.get("id", None)
                self._set_cooldown_deployments(
-                    deployment_id
+                    exception_status=exception_status, deployment=deployment_id
                )  # setting deployment_id in cooldown deployments
            if custom_llm_provider:
                model_name = f"{custom_llm_provider}/{model_name}"
@ -1778,9 +1790,15 @@ class Router:
                key=rpm_key, value=request_count, local_only=True
            )  # don't change existing ttl

-    def _set_cooldown_deployments(self, deployment: Optional[str] = None):
+    def _set_cooldown_deployments(
+        self, exception_status: Union[str, int], deployment: Optional[str] = None
+    ):
        """
        Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
+
+        or
+
+        the exception is not one that should be immediately retried (e.g. 401)
        """
        if deployment is None:
            return
@ -1797,7 +1815,20 @@ class Router:
            f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
        )
        cooldown_time = self.cooldown_time or 1
-        if updated_fails > self.allowed_fails:
+
+        if isinstance(exception_status, str):
+            try:
+                exception_status = int(exception_status)
+            except Exception as e:
+                verbose_router_logger.debug(
+                    "Unable to cast exception status to int {}. Defaulting to status=500.".format(
+                        exception_status
+                    )
+                )
+                exception_status = 500
+        _should_retry = litellm._should_retry(status_code=exception_status)
+
+        if updated_fails > self.allowed_fails or _should_retry == False:
            # get the current cooldown list for that minute
            cooldown_key = f"{current_minute}:cooldown_models"  # group cooldown models by minute to reduce number of redis calls
            cached_value = self.cache.get_cache(key=cooldown_key)
@ -1929,6 +1960,7 @@ class Router:
            )
            default_api_base = api_base
            default_api_key = api_key
+
        if (
            model_name in litellm.open_ai_chat_completion_models
            or custom_llm_provider in litellm.openai_compatible_providers
@ -1940,8 +1972,10 @@ class Router:
            or "ft:gpt-3.5-turbo" in model_name
            or model_name in litellm.open_ai_embedding_models
        ):
+            is_azure_ai_studio_model: bool = False
            if custom_llm_provider == "azure":
                if litellm.utils._is_non_openai_azure_model(model_name):
+                    is_azure_ai_studio_model = True
                    custom_llm_provider = "openai"
                    # remove azure prefx from model_name
                    model_name = model_name.replace("azure/", "")
@ -1964,6 +1998,25 @@ class Router:
                api_base = litellm.get_secret(api_base_env_name)
                litellm_params["api_base"] = api_base

+            ## AZURE AI STUDIO MISTRAL CHECK ##
+            """
+            Make sure api base ends in /v1/
+
+            if not, add it - https://github.com/BerriAI/litellm/issues/2279
+            """
+            if (
+                is_azure_ai_studio_model == True
+                and api_base is not None
+                and not api_base.endswith("/v1/")
+            ):
+                # check if it ends with a trailing slash
+                if api_base.endswith("/"):
+                    api_base += "v1/"
+                elif api_base.endswith("/v1"):
+                    api_base += "/"
+                else:
+                    api_base += "/v1/"
+
            api_version = litellm_params.get("api_version")
            if api_version and api_version.startswith("os.environ/"):
                api_version_env_name = api_version.replace("os.environ/", "")
@ -1986,7 +2039,9 @@ class Router:
                stream_timeout = litellm.get_secret(stream_timeout_env_name)
                litellm_params["stream_timeout"] = stream_timeout

-            max_retries = litellm_params.pop("max_retries", 2)
+            max_retries = litellm_params.pop(
+                "max_retries", 0
+            )  # router handles retry logic
            if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
                max_retries_env_name = max_retries.replace("os.environ/", "")
                max_retries = litellm.get_secret(max_retries_env_name)
@ -2052,10 +2107,12 @@ class Router:
                        timeout=timeout,
                        max_retries=max_retries,
                        http_client=httpx.AsyncClient(
-                            transport=AsyncCustomHTTPTransport(),
+                            transport=AsyncCustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=async_proxy_mounts,
                        ),  # type: ignore
                    )
@ -2074,10 +2131,12 @@ class Router:
                        timeout=timeout,
                        max_retries=max_retries,
                        http_client=httpx.Client(
-                            transport=CustomHTTPTransport(),
+                            transport=CustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=sync_proxy_mounts,
                        ),  # type: ignore
                    )
@ -2096,10 +2155,12 @@ class Router:
                        timeout=stream_timeout,
                        max_retries=max_retries,
                        http_client=httpx.AsyncClient(
-                            transport=AsyncCustomHTTPTransport(),
+                            transport=AsyncCustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=async_proxy_mounts,
                        ),  # type: ignore
                    )
@ -2118,10 +2179,12 @@ class Router:
                        timeout=stream_timeout,
                        max_retries=max_retries,
                        http_client=httpx.Client(
-                            transport=CustomHTTPTransport(),
+                            transport=CustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=sync_proxy_mounts,
                        ),  # type: ignore
                    )
@ -2158,10 +2221,12 @@ class Router:
                        timeout=timeout,
                        max_retries=max_retries,
                        http_client=httpx.AsyncClient(
-                            transport=AsyncCustomHTTPTransport(),
+                            transport=AsyncCustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=async_proxy_mounts,
                        ),  # type: ignore
                    )
@ -2178,10 +2243,12 @@ class Router:
                        timeout=timeout,
                        max_retries=max_retries,
                        http_client=httpx.Client(
-                            transport=CustomHTTPTransport(),
+                            transport=CustomHTTPTransport(
+                                verify=litellm.ssl_verify,
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                            ),
                            mounts=sync_proxy_mounts,
                        ),  # type: ignore
                    )
@ -2199,10 +2266,12 @@ class Router:
                        timeout=stream_timeout,
                        max_retries=max_retries,
                        http_client=httpx.AsyncClient(
-                            transport=AsyncCustomHTTPTransport(),
+                            transport=AsyncCustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=async_proxy_mounts,
                        ),
                    )
@ -2219,10 +2288,12 @@ class Router:
                        timeout=stream_timeout,
                        max_retries=max_retries,
                        http_client=httpx.Client(
-                            transport=CustomHTTPTransport(),
+                            transport=CustomHTTPTransport(
                                limits=httpx.Limits(
                                    max_connections=1000, max_keepalive_connections=100
                                ),
+                                verify=litellm.ssl_verify,
+                            ),
                            mounts=sync_proxy_mounts,
                        ),
                    )
@ -2249,10 +2320,12 @@ class Router:
                    max_retries=max_retries,
                    organization=organization,
                    http_client=httpx.AsyncClient(
-                        transport=AsyncCustomHTTPTransport(),
+                        transport=AsyncCustomHTTPTransport(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
+                            verify=litellm.ssl_verify,
+                        ),
                        mounts=async_proxy_mounts,
                    ),  # type: ignore
                )
@ -2271,10 +2344,12 @@ class Router:
                    max_retries=max_retries,
                    organization=organization,
                    http_client=httpx.Client(
-                        transport=CustomHTTPTransport(),
+                        transport=CustomHTTPTransport(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
+                            verify=litellm.ssl_verify,
+                        ),
                        mounts=sync_proxy_mounts,
                    ),  # type: ignore
                )
@ -2294,10 +2369,12 @@ class Router:
                    max_retries=max_retries,
                    organization=organization,
                    http_client=httpx.AsyncClient(
-                        transport=AsyncCustomHTTPTransport(),
+                        transport=AsyncCustomHTTPTransport(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
+                            verify=litellm.ssl_verify,
+                        ),
                        mounts=async_proxy_mounts,
                    ),  # type: ignore
                )
@ -2317,10 +2394,12 @@ class Router:
                    max_retries=max_retries,
                    organization=organization,
                    http_client=httpx.Client(
-                        transport=CustomHTTPTransport(),
+                        transport=CustomHTTPTransport(
                            limits=httpx.Limits(
                                max_connections=1000, max_keepalive_connections=100
                            ),
+                            verify=litellm.ssl_verify,
+                        ),
                        mounts=sync_proxy_mounts,
                    ),  # type: ignore
                )
@ -2550,6 +2629,11 @@ class Router:
        for var in vars_to_include:
            if var in _all_vars:
                _settings_to_return[var] = _all_vars[var]
+            if (
+                var == "routing_strategy_args"
+                and self.routing_strategy == "latency-based-routing"
+            ):
+                _settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
        return _settings_to_return

    def update_settings(self, **kwargs):
@ -2581,6 +2665,13 @@ class Router:
                    _casted_value = int(kwargs[var])
                    setattr(self, var, _casted_value)
                else:
+                    if var == "routing_strategy":
+                        self.routing_strategy_init(
+                            routing_strategy=kwargs[var],
+                            routing_strategy_args=kwargs.get(
+                                "routing_strategy_args", {}
+                            ),
+                        )
                    setattr(self, var, kwargs[var])
            else:
                verbose_router_logger.debug("Setting {} is not allowed".format(var))
@ -2717,7 +2808,10 @@ class Router:
                self.cache.get_cache(key=model_id, local_only=True) or 0
            )
            ### get usage based cache ###
-            if isinstance(model_group_cache, dict):
+            if (
+                isinstance(model_group_cache, dict)
+                and self.routing_strategy != "usage-based-routing-v2"
+            ):
                model_group_cache[model_id] = model_group_cache.get(model_id, 0)

                current_request = max(
@ -2745,7 +2839,7 @@ class Router:

            if _rate_limit_error == True:  # allow generic fallback logic to take place
                raise ValueError(
-                    f"No deployments available for selected model, passed model={model}"
+                    f"{RouterErrors.no_deployments_available.value}, passed model={model}"
                )
            elif _context_window_error == True:
                raise litellm.ContextWindowExceededError(
@ -2883,6 +2977,11 @@ class Router:
                model=model, healthy_deployments=healthy_deployments, messages=messages
            )

+        if len(healthy_deployments) == 0:
+            raise ValueError(
+                f"{RouterErrors.no_deployments_available.value}, passed model={model}"
+            )
+
        if (
            self.routing_strategy == "usage-based-routing-v2"
            and self.lowesttpm_logger_v2 is not None
@ -2938,7 +3037,7 @@ class Router:
                f"get_available_deployment for model: {model}, No deployment available"
            )
            raise ValueError(
-                f"No deployments available for selected model, passed model={model}"
+                f"{RouterErrors.no_deployments_available.value}, passed model={model}"
            )
        verbose_router_logger.info(
            f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
@ -3068,7 +3167,7 @@ class Router:
                f"get_available_deployment for model: {model}, No deployment available"
            )
            raise ValueError(
-                f"No deployments available for selected model, passed model={model}"
+                f"{RouterErrors.no_deployments_available.value}, passed model={model}"
            )
        verbose_router_logger.info(
            f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
--- a/litellm/router_strategy/lowest_latency.py
+++ b/litellm/router_strategy/lowest_latency.py
@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
 import dotenv, os, requests, random
 from typing import Optional, Union, List, Dict
 from datetime import datetime, timedelta
+import random

 dotenv.load_dotenv()  # Loading env variables using dotenv
 import traceback
@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):

 class RoutingArgs(LiteLLMBase):
    ttl: int = 1 * 60 * 60  # 1 hour
+    lowest_latency_buffer: float = 0


 class LowestLatencyLoggingHandler(CustomLogger):
@ -312,6 +314,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
        except:
            input_tokens = 0

+        # randomly sample from all_deployments, incase all deployments have latency=0.0
+        _items = all_deployments.items()
+
+        all_deployments = random.sample(list(_items), len(_items))
+        all_deployments = dict(all_deployments)
+        ### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
+
+        potential_deployments = []
        for item, item_map in all_deployments.items():
            ## get the item from model list
            _deployment = None
@ -345,23 +355,48 @@ class LowestLatencyLoggingHandler(CustomLogger):
                if isinstance(_call_latency, float):
                    total += _call_latency
            item_latency = total / len(item_latency)
-            if item_latency == 0:
-                deployment = _deployment
-                break
-            elif (
+
+            # -------------- #
+            # Debugging Logic
+            # -------------- #
+            # We use _latency_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
+            # this helps a user to debug why the router picked a specfic deployment      #
+            _deployment_api_base = _deployment.get("litellm_params", {}).get(
+                "api_base", ""
+            )
+            if _deployment_api_base is not None:
+                _latency_per_deployment[_deployment_api_base] = item_latency
+            # -------------- #
+            # End of Debugging Logic
+            # -------------- #
+
+            if (
                item_tpm + input_tokens > _deployment_tpm
                or item_rpm + 1 > _deployment_rpm
            ):  # if user passed in tpm / rpm in the model_list
                continue
-            elif item_latency < lowest_latency:
-                lowest_latency = item_latency
-                deployment = _deployment
+            else:
+                potential_deployments.append((_deployment, item_latency))
+
+        if len(potential_deployments) == 0:
+            return None
+
+        # Sort potential deployments by latency
+        sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
+
+        # Find lowest latency deployment
+        lowest_latency = sorted_deployments[0][1]
+
+        # Find deployments within buffer of lowest latency
+        buffer = self.routing_args.lowest_latency_buffer * lowest_latency
+        valid_deployments = [
+            x for x in sorted_deployments if x[1] <= lowest_latency + buffer
+        ]
+
+        # Pick a random deployment from valid deployments
+        random_valid_deployment = random.choice(valid_deployments)
+        deployment = random_valid_deployment[0]

-            # _latency_per_deployment is used for debuggig
-            _deployment_api_base = _deployment.get("litellm_params", {}).get(
-                "api_base", ""
-            )
-            _latency_per_deployment[_deployment_api_base] = item_latency
        if request_kwargs is not None and "metadata" in request_kwargs:
            request_kwargs["metadata"][
                "_latency_per_deployment"
--- a/litellm/router_strategy/lowest_tpm_rpm.py
+++ b/litellm/router_strategy/lowest_tpm_rpm.py
@ -206,7 +206,7 @@ class LowestTPMLoggingHandler(CustomLogger):
            if item_tpm + input_tokens > _deployment_tpm:
                continue
            elif (rpm_dict is not None and item in rpm_dict) and (
-                rpm_dict[item] + 1 > _deployment_rpm
+                rpm_dict[item] + 1 >= _deployment_rpm
            ):
                continue
            elif item_tpm < lowest_tpm:
--- a/litellm/router_strategy/lowest_tpm_rpm_v2.py
+++ b/litellm/router_strategy/lowest_tpm_rpm_v2.py
@ -333,7 +333,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
                    tpm_dict[tpm_key] = 0

        all_deployments = tpm_dict
-        deployment = None
+        potential_deployments = []  # if multiple deployments have the same low value
        for item, item_tpm in all_deployments.items():
            ## get the item from model list
            _deployment = None
@ -343,6 +343,8 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
                    _deployment = m
            if _deployment is None:
                continue  # skip to next one
+            elif item_tpm is None:
+                continue  # skip if unhealthy deployment

            _deployment_tpm = None
            if _deployment_tpm is None:
@ -366,14 +368,20 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
            if item_tpm + input_tokens > _deployment_tpm:
                continue
            elif (rpm_dict is not None and item in rpm_dict) and (
-                rpm_dict[item] + 1 > _deployment_rpm
+                rpm_dict[item] + 1 >= _deployment_rpm
            ):
                continue
+            elif item_tpm == lowest_tpm:
+                potential_deployments.append(_deployment)
            elif item_tpm < lowest_tpm:
                lowest_tpm = item_tpm
-                deployment = _deployment
+                potential_deployments = [_deployment]
        print_verbose("returning picked lowest tpm/rpm deployment.")
-        return deployment
+
+        if len(potential_deployments) > 0:
+            return random.choice(potential_deployments)
+        else:
+            return None

    async def async_get_available_deployments(
        self,
@ -394,6 +402,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):

        dt = get_utc_datetime()
        current_minute = dt.strftime("%H-%M")
+
        tpm_keys = []
        rpm_keys = []
        for m in healthy_deployments:
@ -416,7 +425,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
        tpm_values = combined_tpm_rpm_values[: len(tpm_keys)]
        rpm_values = combined_tpm_rpm_values[len(tpm_keys) :]

-        return self._common_checks_available_deployment(
+        deployment = self._common_checks_available_deployment(
            model_group=model_group,
            healthy_deployments=healthy_deployments,
            tpm_keys=tpm_keys,
@ -427,6 +436,61 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
            input=input,
        )

+        try:
+            assert deployment is not None
+            return deployment
+        except Exception as e:
+            ### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
+            deployment_dict = {}
+            for index, _deployment in enumerate(healthy_deployments):
+                if isinstance(_deployment, dict):
+                    id = _deployment.get("model_info", {}).get("id")
+                    ### GET DEPLOYMENT TPM LIMIT ###
+                    _deployment_tpm = None
+                    if _deployment_tpm is None:
+                        _deployment_tpm = _deployment.get("tpm", None)
+                    if _deployment_tpm is None:
+                        _deployment_tpm = _deployment.get("litellm_params", {}).get(
+                            "tpm", None
+                        )
+                    if _deployment_tpm is None:
+                        _deployment_tpm = _deployment.get("model_info", {}).get(
+                            "tpm", None
+                        )
+                    if _deployment_tpm is None:
+                        _deployment_tpm = float("inf")
+
+                    ### GET CURRENT TPM ###
+                    current_tpm = tpm_values[index]
+
+                    ### GET DEPLOYMENT TPM LIMIT ###
+                    _deployment_rpm = None
+                    if _deployment_rpm is None:
+                        _deployment_rpm = _deployment.get("rpm", None)
+                    if _deployment_rpm is None:
+                        _deployment_rpm = _deployment.get("litellm_params", {}).get(
+                            "rpm", None
+                        )
+                    if _deployment_rpm is None:
+                        _deployment_rpm = _deployment.get("model_info", {}).get(
+                            "rpm", None
+                        )
+                    if _deployment_rpm is None:
+                        _deployment_rpm = float("inf")
+
+                    ### GET CURRENT RPM ###
+                    current_rpm = rpm_values[index]
+
+                    deployment_dict[id] = {
+                        "current_tpm": current_tpm,
+                        "tpm_limit": _deployment_tpm,
+                        "current_rpm": current_rpm,
+                        "rpm_limit": _deployment_rpm,
+                    }
+            raise ValueError(
+                f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
+            )
+
    def get_available_deployments(
        self,
        model_group: str,
@ -464,7 +528,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
            keys=rpm_keys
        )  # [1, 2, None, ..]

-        return self._common_checks_available_deployment(
+        deployment = self._common_checks_available_deployment(
            model_group=model_group,
            healthy_deployments=healthy_deployments,
            tpm_keys=tpm_keys,
@ -474,3 +538,58 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
            messages=messages,
            input=input,
        )
+
+        try:
+            assert deployment is not None
+            return deployment
+        except Exception as e:
+            ### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
+            deployment_dict = {}
+            for index, _deployment in enumerate(healthy_deployments):
+                if isinstance(_deployment, dict):
+                    id = _deployment.get("model_info", {}).get("id")
+                    ### GET DEPLOYMENT TPM LIMIT ###
+                    _deployment_tpm = None
+                    if _deployment_tpm is None:
+                        _deployment_tpm = _deployment.get("tpm", None)
+                    if _deployment_tpm is None:
+                        _deployment_tpm = _deployment.get("litellm_params", {}).get(
+                            "tpm", None
+                        )
+                    if _deployment_tpm is None:
+                        _deployment_tpm = _deployment.get("model_info", {}).get(
+                            "tpm", None
+                        )
+                    if _deployment_tpm is None:
+                        _deployment_tpm = float("inf")
+
+                    ### GET CURRENT TPM ###
+                    current_tpm = tpm_values[index]
+
+                    ### GET DEPLOYMENT TPM LIMIT ###
+                    _deployment_rpm = None
+                    if _deployment_rpm is None:
+                        _deployment_rpm = _deployment.get("rpm", None)
+                    if _deployment_rpm is None:
+                        _deployment_rpm = _deployment.get("litellm_params", {}).get(
+                            "rpm", None
+                        )
+                    if _deployment_rpm is None:
+                        _deployment_rpm = _deployment.get("model_info", {}).get(
+                            "rpm", None
+                        )
+                    if _deployment_rpm is None:
+                        _deployment_rpm = float("inf")
+
+                    ### GET CURRENT RPM ###
+                    current_rpm = rpm_values[index]
+
+                    deployment_dict[id] = {
+                        "current_tpm": current_tpm,
+                        "tpm_limit": _deployment_tpm,
+                        "current_rpm": current_rpm,
+                        "rpm_limit": _deployment_rpm,
+                    }
+            raise ValueError(
+                f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
+            )
--- a/litellm/tests/conftest.py
+++ b/litellm/tests/conftest.py
@ -19,6 +19,7 @@ def setup_and_teardown():
        0, os.path.abspath("../..")
    )  # Adds the project directory to the system path
    import litellm
+    from litellm import Router

    importlib.reload(litellm)
    import asyncio
--- a/litellm/tests/test_acooldowns_router.py
+++ b/litellm/tests/test_acooldowns_router.py
@ -119,7 +119,9 @@ def test_multiple_deployments_parallel():


 # test_multiple_deployments_parallel()
-def test_cooldown_same_model_name():
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_cooldown_same_model_name(sync_mode):
    # users could have the same model with different api_base
    # example
    # azure/chatgpt, api_base: 1234
@ -161,6 +163,7 @@ def test_cooldown_same_model_name():
            num_retries=3,
        )  # type: ignore

+        if sync_mode:
            response = router.completion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "hello this request will pass"}],
@ -176,6 +179,23 @@ def test_cooldown_same_model_name():
                model_ids[0] != model_ids[1]
            )  # ensure both models have a uuid added, and they have different names

+            print("\ngot response\n", response)
+        else:
+            response = await router.acompletion(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": "hello this request will pass"}],
+            )
+            print(router.model_list)
+            model_ids = []
+            for model in router.model_list:
+                model_ids.append(model["model_info"]["id"])
+            print("\n litellm model ids ", model_ids)
+
+            # example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
+            assert (
+                model_ids[0] != model_ids[1]
+            )  # ensure both models have a uuid added, and they have different names
+
            print("\ngot response\n", response)
    except Exception as e:
        pytest.fail(f"Got unexpected exception on router! - {e}")
--- a/litellm/tests/test_alangfuse.py
+++ b/litellm/tests/test_alangfuse.py
@ -161,40 +161,56 @@ async def make_async_calls():
    return total_time


-# def test_langfuse_logging_async_text_completion():
-#     try:
-#         pre_langfuse_setup()
-#         litellm.set_verbose = False
-#         litellm.success_callback = ["langfuse"]
+@pytest.mark.asyncio
+@pytest.mark.parametrize("stream", [False, True])
+async def test_langfuse_logging_without_request_response(stream):
+    try:
+        import uuid

-#         async def _test_langfuse():
-#             response = await litellm.atext_completion(
-#                 model="gpt-3.5-turbo-instruct",
-#                 prompt="this is a test",
-#                 max_tokens=5,
-#                 temperature=0.7,
-#                 timeout=5,
-#                 user="test_user",
-#                 stream=True
-#             )
-#             async for chunk in response:
-#                 print()
-#                 print(chunk)
-#             await asyncio.sleep(1)
-#             return response
+        _unique_trace_name = f"litellm-test-{str(uuid.uuid4())}"
+        litellm.set_verbose = True
+        litellm.turn_off_message_logging = True
+        litellm.success_callback = ["langfuse"]
+        response = await litellm.acompletion(
+            model="gpt-3.5-turbo",
+            mock_response="It's simple to use and easy to get started",
+            messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
+            max_tokens=10,
+            temperature=0.2,
+            stream=stream,
+            metadata={"trace_id": _unique_trace_name},
+        )
+        print(response)
+        if stream:
+            async for chunk in response:
+                print(chunk)

-#         response = asyncio.run(_test_langfuse())
-#         print(f"response: {response}")
+        await asyncio.sleep(3)

-#         # # check langfuse.log to see if there was a failed response
-#         search_logs("langfuse.log")
-#     except litellm.Timeout as e:
-#         pass
-#     except Exception as e:
-#         pytest.fail(f"An exception occurred - {e}")
+        import langfuse

+        langfuse_client = langfuse.Langfuse(
+            public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
+            secret_key=os.environ["LANGFUSE_SECRET_KEY"],
+        )

-# test_langfuse_logging_async_text_completion()
+        # get trace with _unique_trace_name
+        trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
+
+        print("trace_from_langfuse", trace)
+
+        _trace_data = trace.data
+
+        assert _trace_data[0].input == {"messages": "redacted-by-litellm"}
+        assert _trace_data[0].output == {
+            "role": "assistant",
+            "content": "redacted-by-litellm",
+            "function_call": None,
+            "tool_calls": None,
+        }
+
+    except Exception as e:
+        pytest.fail(f"An exception occurred - {e}")


@pytest.mark.skip(reason="beta test - checking langfuse output")
@ -334,6 +350,220 @@ def test_langfuse_logging_function_calling():
 # test_langfuse_logging_function_calling()


+def test_langfuse_existing_trace_id():
+    """
+    When existing trace id is passed, don't set trace params -> prevents overwriting the trace
+
+    Pass 1 logging object with a trace
+
+    Pass 2nd logging object with the trace id
+
+    Assert no changes to the trace
+    """
+    # Test - if the logs were sent to the correct team on langfuse
+    import litellm, datetime
+    from litellm.integrations.langfuse import LangFuseLogger
+
+    langfuse_Logger = LangFuseLogger(
+        langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
+        langfuse_secret=os.getenv("LANGFUSE_PROJECT2_SECRET"),
+    )
+    litellm.success_callback = ["langfuse"]
+
+    # langfuse_args = {'kwargs': { 'start_time':  'end_time': datetime.datetime(2024, 5, 1, 7, 31, 29, 903685), 'user_id': None, 'print_verbose': <function print_verbose at 0x109d1f420>, 'level': 'DEFAULT', 'status_message': None}
+    response_obj = litellm.ModelResponse(
+        id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
+        choices=[
+            litellm.Choices(
+                finish_reason="stop",
+                index=0,
+                message=litellm.Message(
+                    content="I'm sorry, I am an AI assistant and do not have real-time information. I recommend checking a reliable weather website or app for the most up-to-date weather information in Boston.",
+                    role="assistant",
+                ),
+            )
+        ],
+        created=1714573888,
+        model="gpt-3.5-turbo-0125",
+        object="chat.completion",
+        system_fingerprint="fp_3b956da36b",
+        usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
+    )
+
+    ### NEW TRACE ###
+    message = [{"role": "user", "content": "what's the weather in boston"}]
+    langfuse_args = {
+        "response_obj": response_obj,
+        "kwargs": {
+            "model": "gpt-3.5-turbo",
+            "litellm_params": {
+                "acompletion": False,
+                "api_key": None,
+                "force_timeout": 600,
+                "logger_fn": None,
+                "verbose": False,
+                "custom_llm_provider": "openai",
+                "api_base": "https://api.openai.com/v1/",
+                "litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
+                "model_alias_map": {},
+                "completion_call_id": None,
+                "metadata": None,
+                "model_info": None,
+                "proxy_server_request": None,
+                "preset_cache_key": None,
+                "no-log": False,
+                "stream_response": {},
+            },
+            "messages": message,
+            "optional_params": {"temperature": 0.1, "extra_body": {}},
+            "start_time": "2024-05-01 07:31:27.986164",
+            "stream": False,
+            "user": None,
+            "call_type": "completion",
+            "litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
+            "completion_start_time": "2024-05-01 07:31:29.903685",
+            "temperature": 0.1,
+            "extra_body": {},
+            "input": [{"role": "user", "content": "what's the weather in boston"}],
+            "api_key": "my-api-key",
+            "additional_args": {
+                "complete_input_dict": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "user", "content": "what's the weather in boston"}
+                    ],
+                    "temperature": 0.1,
+                    "extra_body": {},
+                }
+            },
+            "log_event_type": "successful_api_call",
+            "end_time": "2024-05-01 07:31:29.903685",
+            "cache_hit": None,
+            "response_cost": 6.25e-05,
+        },
+        "start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
+        "end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
+        "user_id": None,
+        "print_verbose": litellm.print_verbose,
+        "level": "DEFAULT",
+        "status_message": None,
+    }
+
+    langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
+
+    import langfuse
+
+    langfuse_client = langfuse.Langfuse(
+        public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
+        secret_key=os.getenv("LANGFUSE_PROJECT2_SECRET"),
+    )
+
+    trace_id = langfuse_response_object["trace_id"]
+
+    langfuse_client.flush()
+
+    time.sleep(2)
+
+    print(langfuse_client.get_trace(id=trace_id))
+
+    initial_langfuse_trace = langfuse_client.get_trace(id=trace_id)
+
+    ### EXISTING TRACE ###
+
+    new_metadata = {"existing_trace_id": trace_id}
+    new_messages = [{"role": "user", "content": "What do you know?"}]
+    new_response_obj = litellm.ModelResponse(
+        id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
+        choices=[
+            litellm.Choices(
+                finish_reason="stop",
+                index=0,
+                message=litellm.Message(
+                    content="What do I know?",
+                    role="assistant",
+                ),
+            )
+        ],
+        created=1714573888,
+        model="gpt-3.5-turbo-0125",
+        object="chat.completion",
+        system_fingerprint="fp_3b956da36b",
+        usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
+    )
+    langfuse_args = {
+        "response_obj": new_response_obj,
+        "kwargs": {
+            "model": "gpt-3.5-turbo",
+            "litellm_params": {
+                "acompletion": False,
+                "api_key": None,
+                "force_timeout": 600,
+                "logger_fn": None,
+                "verbose": False,
+                "custom_llm_provider": "openai",
+                "api_base": "https://api.openai.com/v1/",
+                "litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
+                "model_alias_map": {},
+                "completion_call_id": None,
+                "metadata": new_metadata,
+                "model_info": None,
+                "proxy_server_request": None,
+                "preset_cache_key": None,
+                "no-log": False,
+                "stream_response": {},
+            },
+            "messages": new_messages,
+            "optional_params": {"temperature": 0.1, "extra_body": {}},
+            "start_time": "2024-05-01 07:31:27.986164",
+            "stream": False,
+            "user": None,
+            "call_type": "completion",
+            "litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
+            "completion_start_time": "2024-05-01 07:31:29.903685",
+            "temperature": 0.1,
+            "extra_body": {},
+            "input": [{"role": "user", "content": "what's the weather in boston"}],
+            "api_key": "my-api-key",
+            "additional_args": {
+                "complete_input_dict": {
+                    "model": "gpt-3.5-turbo",
+                    "messages": [
+                        {"role": "user", "content": "what's the weather in boston"}
+                    ],
+                    "temperature": 0.1,
+                    "extra_body": {},
+                }
+            },
+            "log_event_type": "successful_api_call",
+            "end_time": "2024-05-01 07:31:29.903685",
+            "cache_hit": None,
+            "response_cost": 6.25e-05,
+        },
+        "start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
+        "end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
+        "user_id": None,
+        "print_verbose": litellm.print_verbose,
+        "level": "DEFAULT",
+        "status_message": None,
+    }
+
+    langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
+
+    new_trace_id = langfuse_response_object["trace_id"]
+
+    assert new_trace_id == trace_id
+
+    langfuse_client.flush()
+
+    time.sleep(2)
+
+    print(langfuse_client.get_trace(id=trace_id))
+
+    new_langfuse_trace = langfuse_client.get_trace(id=trace_id)
+
+    assert dict(initial_langfuse_trace) == dict(new_langfuse_trace)
+
+
 def test_langfuse_logging_tool_calling():
    litellm.set_verbose = True

--- a/litellm/tests/test_alerting.py
+++ b/litellm/tests/test_alerting.py
@ -68,6 +68,7 @@ async def test_get_api_base():
    await _pl.alerting_handler(
        message=slow_message + request_info,
        level="Low",
+        alert_type="llm_too_slow",
    )
    print("passed test_get_api_base")

--- a/litellm/tests/test_amazing_vertex_completion.py
+++ b/litellm/tests/test_amazing_vertex_completion.py
@ -394,6 +394,8 @@ async def test_async_vertexai_response():
            pass
        except litellm.Timeout as e:
            pass
+        except litellm.APIError as e:
+            pass
        except Exception as e:
            pytest.fail(f"An exception occurred: {e}")

@ -636,7 +638,10 @@ def test_gemini_pro_function_calling():
 # gemini_pro_function_calling()


-def test_gemini_pro_function_calling_streaming():
+@pytest.mark.parametrize("stream", [False, True])
+@pytest.mark.parametrize("sync_mode", [False, True])
+@pytest.mark.asyncio
+async def test_gemini_pro_function_calling_streaming(stream, sync_mode):
    load_vertex_ai_credentials()
    litellm.set_verbose = True
    tools = [
@ -665,19 +670,41 @@ def test_gemini_pro_function_calling_streaming():
            "content": "What's the weather like in Boston today in fahrenheit?",
        }
    ]
+    optional_params = {
+        "tools": tools,
+        "tool_choice": "auto",
+        "n": 1,
+        "stream": stream,
+        "temperature": 0.1,
+    }
    try:
-        completion = litellm.completion(
-            model="gemini-pro",
-            messages=messages,
-            tools=tools,
-            tool_choice="auto",
-            stream=True,
+        if sync_mode == True:
+            response = litellm.completion(
+                model="gemini-pro", messages=messages, **optional_params
            )
-        print(f"completion: {completion}")
+            print(f"completion: {response}")
+
+            if stream == True:
                # assert completion.choices[0].message.content is None
                # assert len(completion.choices[0].message.tool_calls) == 1
-        for chunk in completion:
+                for chunk in response:
+                    assert isinstance(chunk, litellm.ModelResponse)
+            else:
+                assert isinstance(response, litellm.ModelResponse)
+        else:
+            response = await litellm.acompletion(
+                model="gemini-pro", messages=messages, **optional_params
+            )
+            print(f"completion: {response}")
+
+            if stream == True:
+                # assert completion.choices[0].message.content is None
+                # assert len(completion.choices[0].message.tool_calls) == 1
+                async for chunk in response:
                    print(f"chunk: {chunk}")
+                    assert isinstance(chunk, litellm.ModelResponse)
+            else:
+                assert isinstance(response, litellm.ModelResponse)
    except litellm.APIError as e:
        pass
    except litellm.RateLimitError as e:
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@ -57,7 +57,7 @@ def test_completion_custom_provider_model_name():
            messages=messages,
            logger_fn=logger_fn,
        )
-        # Add any assertions here to, check the response
+        # Add any assertions here to,check the response
        print(response)
        print(response["choices"][0]["finish_reason"])
    except litellm.Timeout as e:
@ -231,6 +231,76 @@ def test_completion_claude_3_function_call():
        pytest.fail(f"Error occurred: {e}")


+def test_completion_cohere_command_r_plus_function_call():
+    litellm.set_verbose = True
+    tools = [
+        {
+            "type": "function",
+            "function": {
+                "name": "get_current_weather",
+                "description": "Get the current weather in a given location",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "location": {
+                            "type": "string",
+                            "description": "The city and state, e.g. San Francisco, CA",
+                        },
+                        "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+                    },
+                    "required": ["location"],
+                },
+            },
+        }
+    ]
+    messages = [
+        {
+            "role": "user",
+            "content": "What's the weather like in Boston today in Fahrenheit?",
+        }
+    ]
+    try:
+        # test without max tokens
+        response = completion(
+            model="command-r-plus",
+            messages=messages,
+            tools=tools,
+            tool_choice="auto",
+        )
+        # Add any assertions, here to check response args
+        print(response)
+        assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
+        assert isinstance(
+            response.choices[0].message.tool_calls[0].function.arguments, str
+        )
+
+        messages.append(
+            response.choices[0].message.model_dump()
+        )  # Add assistant tool invokes
+        tool_result = (
+            '{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
+        )
+        # Add user submitted tool results in the OpenAI format
+        messages.append(
+            {
+                "tool_call_id": response.choices[0].message.tool_calls[0].id,
+                "role": "tool",
+                "name": response.choices[0].message.tool_calls[0].function.name,
+                "content": tool_result,
+            }
+        )
+        # In the second response, Cohere should deduce answer from tool results
+        second_response = completion(
+            model="command-r-plus",
+            messages=messages,
+            tools=tools,
+            tool_choice="auto",
+        )
+        print(second_response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 def test_parse_xml_params():
    from litellm.llms.prompt_templates.factory import parse_xml_params

@ -1291,6 +1361,7 @@ def test_completion_logprobs_stream():
        for chunk in response:
            # check if atleast one chunk has log probs
            print(chunk)
+            print(f"chunk.choices[0]: {chunk.choices[0]}")
            if "logprobs" in chunk.choices[0]:
                # assert we got a valid logprob in the choices
                assert len(chunk.choices[0].logprobs.content[0].top_logprobs) == 3
@ -1781,7 +1852,6 @@ def test_completion_replicate_llama3():
        print("RESPONSE STRING\n", response_str)
        if type(response_str) != str:
            pytest.fail(f"Error occurred: {e}")
-        raise Exception("it worked!")
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")

@ -2655,6 +2725,88 @@ def test_completion_palm_stream():
        pytest.fail(f"Error occurred: {e}")


+def test_completion_watsonx():
+    litellm.set_verbose = True
+    model_name = "watsonx/ibm/granite-13b-chat-v2"
+    try:
+        response = completion(
+            model=model_name,
+            messages=messages,
+            stop=["stop"],
+            max_tokens=20,
+        )
+        # Add any assertions here to check the response
+        print(response)
+    except litellm.APIError as e:
+        pass
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
+@pytest.mark.parametrize(
+    "provider, model, project, region_name, token",
+    [
+        ("azure", "chatgpt-v-2", None, None, "test-token"),
+        ("vertex_ai", "anthropic-claude-3", "adroit-crow-1", "us-east1", None),
+        ("watsonx", "ibm/granite", "96946574", "dallas", "1234"),
+        ("bedrock", "anthropic.claude-3", None, "us-east-1", None),
+    ],
+)
+def test_unified_auth_params(provider, model, project, region_name, token):
+    """
+    Check if params = ["project", "region_name", "token"]
+    are correctly translated for = ["azure", "vertex_ai", "watsonx", "aws"]
+
+    tests get_optional_params
+    """
+    data = {
+        "project": project,
+        "region_name": region_name,
+        "token": token,
+        "custom_llm_provider": provider,
+        "model": model,
+    }
+
+    translated_optional_params = litellm.utils.get_optional_params(**data)
+
+    if provider == "azure":
+        special_auth_params = (
+            litellm.AzureOpenAIConfig().get_mapped_special_auth_params()
+        )
+    elif provider == "bedrock":
+        special_auth_params = (
+            litellm.AmazonBedrockGlobalConfig().get_mapped_special_auth_params()
+        )
+    elif provider == "vertex_ai":
+        special_auth_params = litellm.VertexAIConfig().get_mapped_special_auth_params()
+    elif provider == "watsonx":
+        special_auth_params = (
+            litellm.IBMWatsonXAIConfig().get_mapped_special_auth_params()
+        )
+
+    for param, value in special_auth_params.items():
+        assert param in data
+        assert value in translated_optional_params
+
+
+@pytest.mark.asyncio
+async def test_acompletion_watsonx():
+    litellm.set_verbose = True
+    model_name = "watsonx/ibm/granite-13b-chat-v2"
+    print("testing watsonx")
+    try:
+        response = await litellm.acompletion(
+            model=model_name,
+            messages=messages,
+            temperature=0.2,
+            max_tokens=80,
+        )
+        # Add any assertions here to check the response
+        print(response)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test_completion_palm_stream()

 # test_completion_deep_infra()
--- a/litellm/tests/test_completion_cost.py
+++ b/litellm/tests/test_completion_cost.py
@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
        completion_response=response, call_type="image_generation"
    )
    assert cost > 0
+
+
+def test_replicate_llama3_cost_tracking():
+    litellm.set_verbose = True
+    model = "replicate/meta/meta-llama-3-8b-instruct"
+    litellm.register_model(
+        {
+            "replicate/meta/meta-llama-3-8b-instruct": {
+                "input_cost_per_token": 0.00000005,
+                "output_cost_per_token": 0.00000025,
+                "litellm_provider": "replicate",
+            }
+        }
+    )
+    response = litellm.ModelResponse(
+        id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
+        choices=[
+            litellm.utils.Choices(
+                finish_reason="stop",
+                index=0,
+                message=litellm.utils.Message(
+                    content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
+                    role="assistant",
+                ),
+            )
+        ],
+        created=1714401369,
+        model="replicate/meta/meta-llama-3-8b-instruct",
+        object="chat.completion",
+        system_fingerprint=None,
+        usage=litellm.utils.Usage(
+            prompt_tokens=48, completion_tokens=31, total_tokens=79
+        ),
+    )
+    cost = litellm.completion_cost(
+        completion_response=response,
+        messages=[{"role": "user", "content": "Hey, how's it going?"}],
+    )
+
+    print(f"cost: {cost}")
+    cost = round(cost, 5)
+    expected_cost = round(
+        litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
+            "input_cost_per_token"
+        ]
+        * 48
+        + litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
+            "output_cost_per_token"
+        ]
+        * 31,
+        5,
+    )
+    assert cost == expected_cost
--- a/litellm/tests/test_config.py
+++ b/litellm/tests/test_config.py
@ -26,6 +26,9 @@ class DBModel(BaseModel):
    model_info: dict
    litellm_params: dict

+    class Config:
+        protected_namespaces = ()
+

@pytest.mark.asyncio
 async def test_delete_deployment():
--- a/litellm/tests/test_custom_callback_input.py
+++ b/litellm/tests/test_custom_callback_input.py
@ -529,6 +529,7 @@ def test_chat_bedrock_stream():
@pytest.mark.asyncio
 async def test_async_chat_bedrock_stream():
    try:
+        litellm.set_verbose = True
        customHandler = CompletionCustomHandler()
        litellm.callbacks = [customHandler]
        response = await litellm.acompletion(
--- a/litellm/tests/test_embedding.py
+++ b/litellm/tests/test_embedding.py
@ -484,6 +484,20 @@ def test_mistral_embeddings():
        pytest.fail(f"Error occurred: {e}")


+@pytest.mark.skip(reason="local test")
+def test_watsonx_embeddings():
+    try:
+        litellm.set_verbose = True
+        response = litellm.embedding(
+            model="watsonx/ibm/slate-30m-english-rtrvr",
+            input=["good morning from litellm"],
+        )
+        print(f"response: {response}")
+        assert isinstance(response.usage, litellm.Usage)
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test_mistral_embeddings()


--- a/litellm/tests/test_function_setup.py
+++ b/litellm/tests/test_function_setup.py
@ -25,7 +25,7 @@ def test_empty_content():
        pass

    function_setup(
-        original_function=completion,
+        original_function="completion",
        rules_obj=rules_obj,
        start_time=datetime.now(),
        messages=[],
--- a/litellm/tests/test_image_generation.py
+++ b/litellm/tests/test_image_generation.py
@ -136,8 +136,8 @@ def test_image_generation_bedrock():
        litellm.set_verbose = True
        response = litellm.image_generation(
            prompt="A cute baby sea otter",
-            model="bedrock/stability.stable-diffusion-xl-v0",
-            aws_region_name="us-east-1",
+            model="bedrock/stability.stable-diffusion-xl-v1",
+            aws_region_name="us-west-2",
        )
        print(f"response: {response}")
    except litellm.RateLimitError as e:
@ -156,8 +156,8 @@ async def test_aimage_generation_bedrock_with_optional_params():
    try:
        response = await litellm.aimage_generation(
            prompt="A cute baby sea otter",
-            model="bedrock/stability.stable-diffusion-xl-v0",
-            size="128x128",
+            model="bedrock/stability.stable-diffusion-xl-v1",
+            size="256x256",
        )
        print(f"response: {response}")
    except litellm.RateLimitError as e:
--- a/litellm/tests/test_least_busy_routing.py
+++ b/litellm/tests/test_least_busy_routing.py
@ -201,6 +201,7 @@ async def test_router_atext_completion_streaming():

@pytest.mark.asyncio
 async def test_router_completion_streaming():
+    litellm.set_verbose = True
    messages = [
        {"role": "user", "content": "Hello, can you generate a 500 words poem?"}
    ]
@ -219,9 +220,9 @@ async def test_router_completion_streaming():
        {
            "model_name": "azure-model",
            "litellm_params": {
-                "model": "azure/gpt-35-turbo",
-                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
-                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 2},
@ -229,9 +230,9 @@ async def test_router_completion_streaming():
        {
            "model_name": "azure-model",
            "litellm_params": {
-                "model": "azure/gpt-35-turbo",
-                "api_key": "os.environ/AZURE_CANADA_API_KEY",
-                "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
                "rpm": 6,
            },
            "model_info": {"id": 3},
@ -262,4 +263,4 @@ async def test_router_completion_streaming():
    ## check if calls equally distributed
    cache_dict = router.cache.get_cache(key=cache_key)
    for k, v in cache_dict.items():
-        assert v == 1
+        assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"
--- a/litellm/tests/test_lowest_latency_routing.py
+++ b/litellm/tests/test_lowest_latency_routing.py
@ -555,3 +555,171 @@ async def test_lowest_latency_routing_with_timeouts():

    # ALL the Requests should have been routed to the fast-endpoint
    assert deployments["fast-endpoint"] == 10
+
+
+@pytest.mark.asyncio
+async def test_lowest_latency_routing_first_pick():
+    """
+    PROD Test:
+    - When all deployments are latency=0, it should randomly pick a deployment
+    - IT SHOULD NEVER PICK THE Very First deployment everytime all deployment latencies are 0
+    - This ensures that after the ttl window resets it randomly picks a deployment
+    """
+    import litellm
+
+    litellm.set_verbose = True
+
+    router = Router(
+        model_list=[
+            {
+                "model_name": "azure-model",
+                "litellm_params": {
+                    "model": "openai/fast-endpoint",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                    "api_key": "fake-key",
+                },
+                "model_info": {"id": "fast-endpoint"},
+            },
+            {
+                "model_name": "azure-model",
+                "litellm_params": {
+                    "model": "openai/fast-endpoint-2",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                    "api_key": "fake-key",
+                },
+                "model_info": {"id": "fast-endpoint-2"},
+            },
+            {
+                "model_name": "azure-model",
+                "litellm_params": {
+                    "model": "openai/fast-endpoint-2",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                    "api_key": "fake-key",
+                },
+                "model_info": {"id": "fast-endpoint-3"},
+            },
+            {
+                "model_name": "azure-model",
+                "litellm_params": {
+                    "model": "openai/fast-endpoint-2",
+                    "api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
+                    "api_key": "fake-key",
+                },
+                "model_info": {"id": "fast-endpoint-4"},
+            },
+        ],
+        routing_strategy="latency-based-routing",
+        routing_strategy_args={"ttl": 0.0000000001},
+        set_verbose=True,
+        debug_level="DEBUG",
+    )  # type: ignore
+
+    deployments = {}
+    for _ in range(5):
+        response = await router.acompletion(
+            model="azure-model", messages=[{"role": "user", "content": "hello"}]
+        )
+        print(response)
+        _picked_model_id = response._hidden_params["model_id"]
+        if _picked_model_id not in deployments:
+            deployments[_picked_model_id] = 1
+        else:
+            deployments[_picked_model_id] += 1
+        await asyncio.sleep(0.000000000005)
+
+    print("deployments", deployments)
+
+    # assert that len(deployments) >1
+    assert len(deployments) > 1
+
+
+@pytest.mark.parametrize("buffer", [0, 1])
+@pytest.mark.asyncio
+async def test_lowest_latency_routing_buffer(buffer):
+    """
+    Allow shuffling calls within a certain latency buffer
+    """
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "rpm": 1440,
+            },
+            "model_info": {"id": 1},
+        },
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-35-turbo",
+                "api_key": "os.environ/AZURE_EUROPE_API_KEY",
+                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+                "rpm": 6,
+            },
+            "model_info": {"id": 2},
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="latency-based-routing",
+        set_verbose=False,
+        num_retries=3,
+        routing_strategy_args={"lowest_latency_buffer": buffer},
+    )  # type: ignore
+
+    ## DEPLOYMENT 1 ##
+    deployment_id = 1
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 1},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 50}}
+    time.sleep(3)
+    end_time = time.time()
+    router.lowestlatency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+    ## DEPLOYMENT 2 ##
+    deployment_id = 2
+    kwargs = {
+        "litellm_params": {
+            "metadata": {
+                "model_group": "azure-model",
+            },
+            "model_info": {"id": 2},
+        }
+    }
+    start_time = time.time()
+    response_obj = {"usage": {"total_tokens": 20}}
+    time.sleep(2)
+    end_time = time.time()
+    router.lowestlatency_logger.log_success_event(
+        response_obj=response_obj,
+        kwargs=kwargs,
+        start_time=start_time,
+        end_time=end_time,
+    )
+
+    ## CHECK WHAT'S SELECTED ##
+    # print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
+    selected_deployments = {}
+    for _ in range(50):
+        print(router.get_available_deployment(model="azure-model"))
+        selected_deployments[
+            router.get_available_deployment(model="azure-model")["model_info"]["id"]
+        ] = 1
+
+    if buffer == 0:
+        assert len(selected_deployments.keys()) == 1
+    else:
+        assert len(selected_deployments.keys()) == 2
--- a/litellm/tests/test_pydantic_namespaces.py
+++ b/litellm/tests/test_pydantic_namespaces.py
@ -0,0 +1,10 @@
+import warnings
+import pytest
+
+def test_namespace_conflict_warning():
+    with warnings.catch_warnings(record=True) as recorded_warnings:
+        warnings.simplefilter("always")  # Capture all warnings
+        import litellm
+
+    # Check that no warning with the specific message was raised
+    assert not any("conflict with protected namespace" in str(w.message) for w in recorded_warnings), "Test failed: 'conflict with protected namespace' warning was encountered!"
--- a/litellm/tests/test_router.py
+++ b/litellm/tests/test_router.py
@ -1,7 +1,7 @@
 #### What this tests ####
 # This tests litellm router

-import sys, os, time
+import sys, os, time, openai
 import traceback, asyncio
 import pytest

@ -14,10 +14,169 @@ from litellm.router import Deployment, LiteLLM_Params, ModelInfo
 from concurrent.futures import ThreadPoolExecutor
 from collections import defaultdict
 from dotenv import load_dotenv
+import os, httpx

 load_dotenv()


+@pytest.mark.parametrize("num_retries", [None, 2])
+@pytest.mark.parametrize("max_retries", [None, 4])
+def test_router_num_retries_init(num_retries, max_retries):
+    """
+    - test when num_retries set v/s not
+    - test client value when max retries set v/s not
+    """
+    router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-3.5-turbo",  # openai model name
+                "litellm_params": {  # params for litellm completion/embedding call
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": "bad-key",
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "max_retries": max_retries,
+                },
+                "model_info": {"id": 12345},
+            },
+        ],
+        num_retries=num_retries,
+    )
+
+    if num_retries is not None:
+        assert router.num_retries == num_retries
+    else:
+        assert router.num_retries == openai.DEFAULT_MAX_RETRIES
+
+    model_client = router._get_client(
+        {"model_info": {"id": 12345}}, client_type="async", kwargs={}
+    )
+
+    if max_retries is not None:
+        assert getattr(model_client, "max_retries") == max_retries
+    else:
+        assert getattr(model_client, "max_retries") == 0
+
+
+@pytest.mark.parametrize(
+    "timeout", [10, 1.0, httpx.Timeout(timeout=300.0, connect=20.0)]
+)
+@pytest.mark.parametrize("ssl_verify", [True, False])
+def test_router_timeout_init(timeout, ssl_verify):
+    """
+    Allow user to pass httpx.Timeout
+
+    related issue - https://github.com/BerriAI/litellm/issues/3162
+    """
+    litellm.ssl_verify = ssl_verify
+
+    router = Router(
+        model_list=[
+            {
+                "model_name": "test-model",
+                "litellm_params": {
+                    "model": "azure/chatgpt-v-2",
+                    "api_key": os.getenv("AZURE_API_KEY"),
+                    "api_base": os.getenv("AZURE_API_BASE"),
+                    "api_version": os.getenv("AZURE_API_VERSION"),
+                    "timeout": timeout,
+                },
+                "model_info": {"id": 1234},
+            }
+        ]
+    )
+
+    model_client = router._get_client(
+        deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
+    )
+
+    assert getattr(model_client, "timeout") == timeout
+
+    print(f"vars model_client: {vars(model_client)}")
+    http_client = getattr(model_client, "_client")
+    print(f"http client: {vars(http_client)}, ssl_Verify={ssl_verify}")
+    if ssl_verify == False:
+        assert http_client._transport._pool._ssl_context.verify_mode.name == "CERT_NONE"
+    else:
+        assert (
+            http_client._transport._pool._ssl_context.verify_mode.name
+            == "CERT_REQUIRED"
+        )
+
+
+@pytest.mark.parametrize("sync_mode", [False, True])
+@pytest.mark.asyncio
+async def test_router_retries(sync_mode):
+    """
+    - make sure retries work as expected
+    """
+    model_list = [
+        {
+            "model_name": "gpt-3.5-turbo",
+            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad-key"},
+        },
+        {
+            "model_name": "gpt-3.5-turbo",
+            "litellm_params": {
+                "model": "azure/chatgpt-v-2",
+                "api_key": os.getenv("AZURE_API_KEY"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+                "api_version": os.getenv("AZURE_API_VERSION"),
+            },
+        },
+    ]
+
+    router = Router(model_list=model_list, num_retries=2)
+
+    if sync_mode:
+        router.completion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+    else:
+        await router.acompletion(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "user", "content": "Hey, how's it going?"}],
+        )
+
+
+@pytest.mark.parametrize(
+    "mistral_api_base",
+    [
+        "os.environ/AZURE_MISTRAL_API_BASE",
+        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1/",
+        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1",
+        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/",
+        "https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com",
+    ],
+)
+def test_router_azure_ai_studio_init(mistral_api_base):
+    router = Router(
+        model_list=[
+            {
+                "model_name": "test-model",
+                "litellm_params": {
+                    "model": "azure/mistral-large-latest",
+                    "api_key": "os.environ/AZURE_MISTRAL_API_KEY",
+                    "api_base": mistral_api_base,
+                },
+                "model_info": {"id": 1234},
+            }
+        ]
+    )
+
+    model_client = router._get_client(
+        deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
+    )
+    url = getattr(model_client, "_base_url")
+    uri_reference = str(getattr(url, "_uri_reference"))
+
+    print(f"uri_reference: {uri_reference}")
+
+    assert "/v1/" in uri_reference
+    assert uri_reference.count("v1") == 1
+
+
 def test_exception_raising():
    # this tests if the router raises an exception when invalid params are set
    # in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
@ -995,6 +1154,7 @@ def test_consistent_model_id():
    assert id1 == id2


+@pytest.mark.skip(reason="local test")
 def test_reading_keys_os_environ():
    import openai

@ -1094,6 +1254,7 @@ def test_reading_keys_os_environ():
 # test_reading_keys_os_environ()


+@pytest.mark.skip(reason="local test")
 def test_reading_openai_keys_os_environ():
    import openai

--- a/litellm/tests/test_router_debug_logs.py
+++ b/litellm/tests/test_router_debug_logs.py
@ -46,6 +46,7 @@ def test_async_fallbacks(caplog):
    router = Router(
        model_list=model_list,
        fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
+        num_retries=1,
    )

    user_message = "Hello, how are you?"
@ -81,8 +82,8 @@ def test_async_fallbacks(caplog):
    # Define the expected log messages
    # - error request, falling back notice, success notice
    expected_logs = [
-        "Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
        "litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
+        "litellm.acompletion(model=None)\x1b[31m Exception No deployments available for selected model, passed model=gpt-3.5-turbo\x1b[0m",
        "Falling back to model_group = azure/gpt-3.5-turbo",
        "litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
    ]
--- a/litellm/tests/test_router_fallbacks.py
+++ b/litellm/tests/test_router_fallbacks.py
@ -22,10 +22,10 @@ class MyCustomHandler(CustomLogger):
    def log_pre_api_call(self, model, messages, kwargs):
        print(f"Pre-API Call")
        print(
-            f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}"
+            f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
        )
-        self.previous_models += len(
-            kwargs["litellm_params"]["metadata"]["previous_models"]
+        self.previous_models = len(
+            kwargs["litellm_params"]["metadata"].get("previous_models", [])
        )  # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
        print(f"self.previous_models: {self.previous_models}")

@ -127,7 +127,7 @@ def test_sync_fallbacks():
        response = router.completion(**kwargs)
        print(f"response: {response}")
        time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4

        print("Passed ! Test router_fallbacks: test_sync_fallbacks()")
        router.reset()
@ -140,7 +140,7 @@ def test_sync_fallbacks():

@pytest.mark.asyncio
 async def test_async_fallbacks():
-    litellm.set_verbose = False
+    litellm.set_verbose = True
    model_list = [
        {  # list of model deployments
            "model_name": "azure/gpt-3.5-turbo",  # openai model name
@ -209,12 +209,13 @@ async def test_async_fallbacks():
    user_message = "Hello, how are you?"
    messages = [{"content": user_message, "role": "user"}]
    try:
+        kwargs["model"] = "azure/gpt-3.5-turbo"
        response = await router.acompletion(**kwargs)
        print(f"customHandler.previous_models: {customHandler.previous_models}")
        await asyncio.sleep(
            0.05
        )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
        router.reset()
    except litellm.Timeout as e:
        pass
@ -268,7 +269,7 @@ def test_sync_fallbacks_embeddings():
        response = router.embedding(**kwargs)
        print(f"customHandler.previous_models: {customHandler.previous_models}")
        time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
        router.reset()
    except litellm.Timeout as e:
        pass
@ -322,7 +323,7 @@ async def test_async_fallbacks_embeddings():
        await asyncio.sleep(
            0.05
        )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
        router.reset()
    except litellm.Timeout as e:
        pass
@ -401,7 +402,7 @@ def test_dynamic_fallbacks_sync():
        response = router.completion(**kwargs)
        print(f"response: {response}")
        time.sleep(0.05)  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
        router.reset()
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")
@ -487,7 +488,7 @@ async def test_dynamic_fallbacks_async():
        await asyncio.sleep(
            0.05
        )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
        router.reset()
    except Exception as e:
        pytest.fail(f"An exception occurred - {e}")
@ -572,7 +573,7 @@ async def test_async_fallbacks_streaming():
        await asyncio.sleep(
            0.05
        )  # allow a delay as success_callbacks are on a separate thread
-        assert customHandler.previous_models == 1  # 0 retries, 1 fallback
+        assert customHandler.previous_models == 4  # 1 init call, 2 retries, 1 fallback
        router.reset()
    except litellm.Timeout as e:
        pass
@ -751,7 +752,7 @@ async def test_async_fallbacks_max_retries_per_request():
        router.reset()


-def test_usage_based_routing_fallbacks():
+def test_ausage_based_routing_fallbacks():
    try:
        # [Prod Test]
        # IT tests Usage Based Routing with fallbacks
@ -765,10 +766,10 @@ def test_usage_based_routing_fallbacks():
        load_dotenv()

        # Constants for TPM and RPM allocation
-        AZURE_FAST_TPM = 3
-        AZURE_BASIC_TPM = 4
-        OPENAI_TPM = 400
-        ANTHROPIC_TPM = 100000
+        AZURE_FAST_RPM = 1
+        AZURE_BASIC_RPM = 1
+        OPENAI_RPM = 2
+        ANTHROPIC_RPM = 100000

        def get_azure_params(deployment_name: str):
            params = {
@ -797,22 +798,26 @@ def test_usage_based_routing_fallbacks():
            {
                "model_name": "azure/gpt-4-fast",
                "litellm_params": get_azure_params("chatgpt-v-2"),
-                "tpm": AZURE_FAST_TPM,
+                "model_info": {"id": 1},
+                "rpm": AZURE_FAST_RPM,
            },
            {
                "model_name": "azure/gpt-4-basic",
                "litellm_params": get_azure_params("chatgpt-v-2"),
-                "tpm": AZURE_BASIC_TPM,
+                "model_info": {"id": 2},
+                "rpm": AZURE_BASIC_RPM,
            },
            {
                "model_name": "openai-gpt-4",
                "litellm_params": get_openai_params("gpt-3.5-turbo"),
-                "tpm": OPENAI_TPM,
+                "model_info": {"id": 3},
+                "rpm": OPENAI_RPM,
            },
            {
                "model_name": "anthropic-claude-instant-1.2",
                "litellm_params": get_anthropic_params("claude-instant-1.2"),
-                "tpm": ANTHROPIC_TPM,
+                "model_info": {"id": 4},
+                "rpm": ANTHROPIC_RPM,
            },
        ]
        # litellm.set_verbose=True
@ -830,6 +835,7 @@ def test_usage_based_routing_fallbacks():
            routing_strategy="usage-based-routing",
            redis_host=os.environ["REDIS_HOST"],
            redis_port=os.environ["REDIS_PORT"],
+            num_retries=0,
        )

        messages = [
@ -842,10 +848,10 @@ def test_usage_based_routing_fallbacks():
            mock_response="very nice to meet you",
        )
        print("response: ", response)
-        print("response._hidden_params: ", response._hidden_params)
+        print(f"response._hidden_params: {response._hidden_params}")
        # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
        # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
-        assert response._hidden_params["custom_llm_provider"] == "openai"
+        assert response._hidden_params["model_id"] == "1"

        # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
        for i in range(20):
@ -859,7 +865,7 @@ def test_usage_based_routing_fallbacks():
            print("response._hidden_params: ", response._hidden_params)
            if i == 19:
                # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
-                assert response._hidden_params["custom_llm_provider"] == "anthropic"
+                assert response._hidden_params["model_id"] == "4"

    except Exception as e:
        pytest.fail(f"An exception occurred {e}")
--- a/litellm/tests/test_router_init.py
+++ b/litellm/tests/test_router_init.py
@ -203,7 +203,7 @@ def test_timeouts_router():
                },
            },
        ]
-        router = Router(model_list=model_list)
+        router = Router(model_list=model_list, num_retries=0)

        print("PASSED !")

@ -396,7 +396,9 @@ def test_router_init_gpt_4_vision_enhancements():
        pytest.fail(f"Error occurred: {e}")


-def test_openai_with_organization():
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_openai_with_organization(sync_mode):
    try:
        print("Testing OpenAI with organization")
        model_list = [
@ -418,6 +420,7 @@ def test_openai_with_organization():
        print(router.model_list)
        print(router.model_list[0])

+        if sync_mode:
            openai_client = router._get_client(
                deployment=router.model_list[0],
                kwargs={"input": ["hello"], "model": "openai-bad-org"},
@ -433,7 +436,9 @@ def test_openai_with_organization():
                    model="openai-bad-org",
                    messages=[{"role": "user", "content": "this is a test"}],
                )
-            pytest.fail("Request should have failed - This organization does not exist")
+                pytest.fail(
+                    "Request should have failed - This organization does not exist"
+                )
            except Exception as e:
                print("Got exception: " + str(e))
                assert "No such organization: org-ikDc4ex8NB" in str(e)
@ -444,6 +449,36 @@ def test_openai_with_organization():
                messages=[{"role": "user", "content": "this is a test"}],
                max_tokens=5,
            )
+        else:
+            openai_client = router._get_client(
+                deployment=router.model_list[0],
+                kwargs={"input": ["hello"], "model": "openai-bad-org"},
+                client_type="async",
+            )
+            print(vars(openai_client))
+
+            assert openai_client.organization == "org-ikDc4ex8NB"
+
+            # bad org raises error
+
+            try:
+                response = await router.acompletion(
+                    model="openai-bad-org",
+                    messages=[{"role": "user", "content": "this is a test"}],
+                )
+                pytest.fail(
+                    "Request should have failed - This organization does not exist"
+                )
+            except Exception as e:
+                print("Got exception: " + str(e))
+                assert "No such organization: org-ikDc4ex8NB" in str(e)
+
+            # good org works
+            response = await router.acompletion(
+                model="openai-good-org",
+                messages=[{"role": "user", "content": "this is a test"}],
+                max_tokens=5,
+            )

    except Exception as e:
        pytest.fail(f"Error occurred: {e}")
--- a/litellm/tests/test_router_retries.py
+++ b/litellm/tests/test_router_retries.py
@ -0,0 +1,121 @@
+#### What this tests ####
+#    This tests calling router with fallback models
+
+import sys, os, time
+import traceback, asyncio
+import pytest
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+import litellm
+from litellm import Router
+from litellm.integrations.custom_logger import CustomLogger
+
+
+class MyCustomHandler(CustomLogger):
+    success: bool = False
+    failure: bool = False
+    previous_models: int = 0
+
+    def log_pre_api_call(self, model, messages, kwargs):
+        print(f"Pre-API Call")
+        print(
+            f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
+        )
+        self.previous_models = len(
+            kwargs["litellm_params"]["metadata"].get("previous_models", [])
+        )  # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
+        print(f"self.previous_models: {self.previous_models}")
+
+    def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
+        print(
+            f"Post-API Call - response object: {response_obj}; model: {kwargs['model']}"
+        )
+
+    def log_stream_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Stream")
+
+    def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Stream")
+
+    def log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Success")
+
+    async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Success")
+
+    def log_failure_event(self, kwargs, response_obj, start_time, end_time):
+        print(f"On Failure")
+
+
+"""
+Test sync + async 
+
+- Authorization Errors 
+- Random API Error 
+"""
+
+
+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.parametrize("error_type", ["Authorization Error", "API Error"])
+@pytest.mark.asyncio
+async def test_router_retries_errors(sync_mode, error_type):
+    """
+    - Auth Error -> 0 retries
+    - API Error -> 2 retries
+    """
+
+    _api_key = (
+        "bad-key" if error_type == "Authorization Error" else os.getenv("AZURE_API_KEY")
+    )
+    print(f"_api_key: {_api_key}")
+    model_list = [
+        {
+            "model_name": "azure/gpt-3.5-turbo",  # openai model name
+            "litellm_params": {  # params for litellm completion/embedding call
+                "model": "azure/chatgpt-functioncalling",
+                "api_key": _api_key,
+                "api_version": os.getenv("AZURE_API_VERSION"),
+                "api_base": os.getenv("AZURE_API_BASE"),
+            },
+            "tpm": 240000,
+            "rpm": 1800,
+        },
+    ]
+
+    router = Router(model_list=model_list, allowed_fails=3)
+
+    customHandler = MyCustomHandler()
+    litellm.callbacks = [customHandler]
+    user_message = "Hello, how are you?"
+    messages = [{"content": user_message, "role": "user"}]
+
+    kwargs = {
+        "model": "azure/gpt-3.5-turbo",
+        "messages": messages,
+        "mock_response": (
+            None
+            if error_type == "Authorization Error"
+            else Exception("Invalid Request")
+        ),
+    }
+
+    try:
+        if sync_mode:
+            response = router.completion(**kwargs)
+        else:
+            response = await router.acompletion(**kwargs)
+    except Exception as e:
+        pass
+
+    await asyncio.sleep(
+        0.05
+    )  # allow a delay as success_callbacks are on a separate thread
+    print(f"customHandler.previous_models: {customHandler.previous_models}")
+
+    if error_type == "Authorization Error":
+        assert customHandler.previous_models == 0  # 0 retries
+    else:
+        assert customHandler.previous_models == 2  # 2 retries
--- a/litellm/tests/test_router_timeout.py
+++ b/litellm/tests/test_router_timeout.py
@ -57,6 +57,7 @@ def test_router_timeouts():
        redis_password=os.getenv("REDIS_PASSWORD"),
        redis_port=int(os.getenv("REDIS_PORT")),
        timeout=10,
+        num_retries=0,
    )

    print("***** TPM SETTINGS *****")
@ -89,15 +90,15 @@ def test_router_timeouts():

@pytest.mark.asyncio
 async def test_router_timeouts_bedrock():
-    import openai
+    import openai, uuid

    # Model list for OpenAI and Anthropic models
-    model_list = [
+    _model_list = [
        {
            "model_name": "bedrock",
            "litellm_params": {
                "model": "bedrock/anthropic.claude-instant-v1",
-                "timeout": 0.001,
+                "timeout": 0.00001,
            },
            "tpm": 80000,
        },
@ -105,17 +106,18 @@ async def test_router_timeouts_bedrock():

    # Configure router
    router = Router(
-        model_list=model_list,
+        model_list=_model_list,
        routing_strategy="usage-based-routing",
        debug_level="DEBUG",
        set_verbose=True,
+        num_retries=0,
    )

    litellm.set_verbose = True
    try:
        response = await router.acompletion(
            model="bedrock",
-            messages=[{"role": "user", "content": "hello, who are u"}],
+            messages=[{"role": "user", "content": f"hello, who are u {uuid.uuid4()}"}],
        )
        print(response)
        pytest.fail("Did not raise error `openai.APITimeoutError`")
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@ -518,7 +518,7 @@ async def test_acompletion_gemini_stream():
        litellm.set_verbose = True
        print("Streaming gemini response")
        messages = [
-            {"role": "system", "content": "You are a helpful assistant."},
+            # {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": "What do you know?",
@ -1271,6 +1271,33 @@ def test_completion_sagemaker_stream():
        pytest.fail(f"Error occurred: {e}")


+def test_completion_watsonx_stream():
+    litellm.set_verbose = True
+    try:
+        response = completion(
+            model="watsonx/ibm/granite-13b-chat-v2",
+            messages=messages,
+            temperature=0.5,
+            max_tokens=20,
+            stream=True,
+        )
+        complete_response = ""
+        has_finish_reason = False
+        # Add any assertions here to check the response
+        for idx, chunk in enumerate(response):
+            chunk, finished = streaming_format_tests(idx, chunk)
+            has_finish_reason = finished
+            if finished:
+                break
+            complete_response += chunk
+        if has_finish_reason is False:
+            raise Exception("finish reason not set for last chunk")
+        if complete_response.strip() == "":
+            raise Exception("Empty response received")
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")
+
+
 # test_completion_sagemaker_stream()


@ -2446,6 +2473,34 @@ class ModelResponseIterator:
        return self.model_response


+class ModelResponseListIterator:
+    def __init__(self, model_responses):
+        self.model_responses = model_responses
+        self.index = 0
+
+    # Sync iterator
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.index >= len(self.model_responses):
+            raise StopIteration
+        model_response = self.model_responses[self.index]
+        self.index += 1
+        return model_response
+
+    # Async iterator
+    def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        if self.index >= len(self.model_responses):
+            raise StopAsyncIteration
+        model_response = self.model_responses[self.index]
+        self.index += 1
+        return model_response
+
+
 def test_unit_test_custom_stream_wrapper():
    """
    Test if last streaming chunk ends with '?', if the message repeats itself.
@ -2486,3 +2541,268 @@ def test_unit_test_custom_stream_wrapper():
            if "How are you?" in chunk.choices[0].delta.content:
                freq += 1
    assert freq == 1
+
+
+def test_aamazing_unit_test_custom_stream_wrapper_n():
+    """
+    Test if the translated output maps exactly to the received openai input
+
+    Relevant issue: https://github.com/BerriAI/litellm/issues/3276
+    """
+    chunks = [
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "It"},
+                    "logprobs": {
+                        "content": [
+                            {
+                                "token": "It",
+                                "logprob": -1.5952516,
+                                "bytes": [73, 116],
+                                "top_logprobs": [
+                                    {
+                                        "token": "Brown",
+                                        "logprob": -0.7358765,
+                                        "bytes": [66, 114, 111, 119, 110],
+                                    }
+                                ],
+                            }
+                        ]
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "index": 1,
+                    "delta": {"content": "Brown"},
+                    "logprobs": {
+                        "content": [
+                            {
+                                "token": "Brown",
+                                "logprob": -0.7358765,
+                                "bytes": [66, 114, 111, 119, 110],
+                                "top_logprobs": [
+                                    {
+                                        "token": "Brown",
+                                        "logprob": -0.7358765,
+                                        "bytes": [66, 114, 111, 119, 110],
+                                    }
+                                ],
+                            }
+                        ]
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "'s"},
+                    "logprobs": {
+                        "content": [
+                            {
+                                "token": "'s",
+                                "logprob": -0.006786893,
+                                "bytes": [39, 115],
+                                "top_logprobs": [
+                                    {
+                                        "token": "'s",
+                                        "logprob": -0.006786893,
+                                        "bytes": [39, 115],
+                                    }
+                                ],
+                            }
+                        ]
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": " impossible"},
+                    "logprobs": {
+                        "content": [
+                            {
+                                "token": " impossible",
+                                "logprob": -0.06528423,
+                                "bytes": [
+                                    32,
+                                    105,
+                                    109,
+                                    112,
+                                    111,
+                                    115,
+                                    115,
+                                    105,
+                                    98,
+                                    108,
+                                    101,
+                                ],
+                                "top_logprobs": [
+                                    {
+                                        "token": " impossible",
+                                        "logprob": -0.06528423,
+                                        "bytes": [
+                                            32,
+                                            105,
+                                            109,
+                                            112,
+                                            111,
+                                            115,
+                                            115,
+                                            105,
+                                            98,
+                                            108,
+                                            101,
+                                        ],
+                                    }
+                                ],
+                            }
+                        ]
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {
+                    "index": 0,
+                    "delta": {"content": "—even"},
+                    "logprobs": {
+                        "content": [
+                            {
+                                "token": "—even",
+                                "logprob": -9999.0,
+                                "bytes": [226, 128, 148, 101, 118, 101, 110],
+                                "top_logprobs": [
+                                    {
+                                        "token": " to",
+                                        "logprob": -0.12302828,
+                                        "bytes": [32, 116, 111],
+                                    }
+                                ],
+                            }
+                        ]
+                    },
+                    "finish_reason": None,
+                }
+            ],
+        },
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
+            ],
+        },
+        {
+            "id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
+            "object": "chat.completion.chunk",
+            "created": 1714075272,
+            "model": "gpt-4-0613",
+            "system_fingerprint": None,
+            "choices": [
+                {"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
+            ],
+        },
+    ]
+
+    litellm.set_verbose = True
+
+    chunk_list = []
+    for chunk in chunks:
+        new_chunk = litellm.ModelResponse(stream=True, id=chunk["id"])
+        if "choices" in chunk and isinstance(chunk["choices"], list):
+            print("INSIDE CHUNK CHOICES!")
+            new_choices = []
+            for choice in chunk["choices"]:
+                if isinstance(choice, litellm.utils.StreamingChoices):
+                    _new_choice = choice
+                elif isinstance(choice, dict):
+                    _new_choice = litellm.utils.StreamingChoices(**choice)
+                new_choices.append(_new_choice)
+            new_chunk.choices = new_choices
+        chunk_list.append(new_chunk)
+
+    completion_stream = ModelResponseListIterator(model_responses=chunk_list)
+
+    response = litellm.CustomStreamWrapper(
+        completion_stream=completion_stream,
+        model="gpt-4-0613",
+        custom_llm_provider="cached_response",
+        logging_obj=litellm.Logging(
+            model="gpt-4-0613",
+            messages=[{"role": "user", "content": "Hey"}],
+            stream=True,
+            call_type="completion",
+            start_time=time.time(),
+            litellm_call_id="12345",
+            function_id="1245",
+        ),
+    )
+
+    for idx, chunk in enumerate(response):
+        chunk_dict = {}
+        try:
+            chunk_dict = chunk.model_dump(exclude_none=True)
+        except:
+            chunk_dict = chunk.dict(exclude_none=True)
+
+        chunk_dict.pop("created")
+        chunks[idx].pop("created")
+        if chunks[idx]["system_fingerprint"] is None:
+            chunks[idx].pop("system_fingerprint", None)
+        if idx == 0:
+            for choice in chunk_dict["choices"]:
+                if "role" in choice["delta"]:
+                    choice["delta"].pop("role")
+
+        for choice in chunks[idx]["choices"]:
+            # ignore finish reason None - since our pydantic object is set to exclude_none = true
+            if "finish_reason" in choice and choice["finish_reason"] is None:
+                choice.pop("finish_reason")
+            if "logprobs" in choice and choice["logprobs"] is None:
+                choice.pop("logprobs")
+
+        assert (
+            chunk_dict == chunks[idx]
+        ), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"
--- a/litellm/tests/test_timeout.py
+++ b/litellm/tests/test_timeout.py
@ -78,7 +78,8 @@ def test_hanging_request_azure():
                    "model_name": "openai-gpt",
                    "litellm_params": {"model": "gpt-3.5-turbo"},
                },
-            ]
+            ],
+            num_retries=0,
        )

        encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -131,7 +132,8 @@ def test_hanging_request_openai():
                    "model_name": "openai-gpt",
                    "litellm_params": {"model": "gpt-3.5-turbo"},
                },
-            ]
+            ],
+            num_retries=0,
        )

        encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -189,6 +191,7 @@ def test_timeout_streaming():
 # test_timeout_streaming()


+@pytest.mark.skip(reason="local test")
 def test_timeout_ollama():
    # this Will Raise a timeout
    import litellm
--- a/litellm/tests/test_tpm_rpm_routing_v2.py
+++ b/litellm/tests/test_tpm_rpm_routing_v2.py
@ -282,6 +282,64 @@ def test_router_skip_rate_limited_deployments():
        print(f"An exception occurred! {str(e)}")


+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.asyncio
+async def test_multiple_potential_deployments(sync_mode):
+    """
+    If multiple deployments have the same tpm value
+
+    call 5 times, test if deployments are shuffled.
+
+    -> prevents single deployment from being overloaded in high-concurrency scenario
+    """
+
+    model_list = [
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+        },
+        {
+            "model_name": "azure-model",
+            "litellm_params": {
+                "model": "azure/gpt-turbo-2",
+                "api_key": "os.environ/AZURE_FRANCE_API_KEY",
+                "api_base": "https://openai-france-1234.openai.azure.com",
+                "tpm": 1440,
+            },
+        },
+    ]
+    router = Router(
+        model_list=model_list,
+        routing_strategy="usage-based-routing-v2",
+        set_verbose=False,
+        num_retries=3,
+    )  # type: ignore
+
+    model_ids = set()
+    for _ in range(1000):
+        if sync_mode:
+            deployment = router.get_available_deployment(
+                model="azure-model",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            )
+        else:
+            deployment = await router.async_get_available_deployment(
+                model="azure-model",
+                messages=[{"role": "user", "content": "Hey, how's it going?"}],
+            )
+
+        ## get id ##
+        id = deployment.get("model_info", {}).get("id")
+        model_ids.add(id)
+
+    assert len(model_ids) == 2
+
+
 def test_single_deployment_tpm_zero():
    import litellm
    import os
--- a/litellm/types/router.py
+++ b/litellm/types/router.py
@ -1,5 +1,5 @@
 from typing import List, Optional, Union, Dict, Tuple, Literal
-
+import httpx
 from pydantic import BaseModel, validator
 from .completion import CompletionRequest
 from .embedding import EmbeddingRequest
@ -104,11 +104,13 @@ class LiteLLM_Params(BaseModel):
    api_key: Optional[str] = None
    api_base: Optional[str] = None
    api_version: Optional[str] = None
-    timeout: Optional[Union[float, str]] = None  # if str, pass in as os.environ/
+    timeout: Optional[Union[float, str, httpx.Timeout]] = (
+        None  # if str, pass in as os.environ/
+    )
    stream_timeout: Optional[Union[float, str]] = (
        None  # timeout when making stream=True calls, if str, pass in as os.environ/
    )
-    max_retries: int = 2  # follows openai default of 2
+    max_retries: Optional[int] = None
    organization: Optional[str] = None  # for openai orgs
    ## VERTEX AI ##
    vertex_project: Optional[str] = None
@ -146,14 +148,13 @@ class LiteLLM_Params(BaseModel):
        args.pop("self", None)
        args.pop("params", None)
        args.pop("__class__", None)
-        if max_retries is None:
-            max_retries = 2
-        elif isinstance(max_retries, str):
+        if max_retries is not None and isinstance(max_retries, str):
            max_retries = int(max_retries)  # cast to int
        super().__init__(max_retries=max_retries, **args, **params)

    class Config:
        extra = "allow"
+        arbitrary_types_allowed = True

    def __contains__(self, key):
        # Define custom behavior for the 'in' operator
@ -201,6 +202,9 @@ class updateDeployment(BaseModel):
    litellm_params: Optional[updateLiteLLMParams] = None
    model_info: Optional[ModelInfo] = None

+    class Config:
+        protected_namespaces = ()
+

 class Deployment(BaseModel):
    model_name: str
@ -259,3 +263,4 @@ class RouterErrors(enum.Enum):
    """

    user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
+    no_deployments_available = "No deployments available for selected model"
--- a/litellm/utils.py
+++ b/litellm/utils.py
@ -19,6 +19,7 @@ from functools import wraps
 import datetime, time
 import tiktoken
 import uuid
+from pydantic import BaseModel
 import aiohttp
 import textwrap
 import logging
@ -69,6 +70,7 @@ from .integrations.langsmith import LangsmithLogger
 from .integrations.weights_biases import WeightsBiasesLogger
 from .integrations.custom_logger import CustomLogger
 from .integrations.langfuse import LangFuseLogger
+from .integrations.openmeter import OpenMeterLogger
 from .integrations.datadog import DataDogLogger
 from .integrations.prometheus import PrometheusLogger
 from .integrations.prometheus_services import PrometheusServicesLogger
@ -105,7 +107,7 @@ try:
 except Exception as e:
    verbose_logger.debug(f"Exception import enterprise features {str(e)}")

-from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO
+from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO, Iterable
 from .caching import Cache
 from concurrent.futures import ThreadPoolExecutor

@ -129,6 +131,7 @@ langsmithLogger = None
 weightsBiasesLogger = None
 customLogger = None
 langFuseLogger = None
+openMeterLogger = None
 dataDogLogger = None
 prometheusLogger = None
 dynamoLogger = None
@ -219,6 +222,61 @@ def map_finish_reason(
    return finish_reason


+class TopLogprob(OpenAIObject):
+    token: str
+    """The token."""
+
+    bytes: Optional[List[int]] = None
+    """A list of integers representing the UTF-8 bytes representation of the token.
+
+    Useful in instances where characters are represented by multiple tokens and
+    their byte representations must be combined to generate the correct text
+    representation. Can be `null` if there is no bytes representation for the token.
+    """
+
+    logprob: float
+    """The log probability of this token, if it is within the top 20 most likely
+    tokens.
+
+    Otherwise, the value `-9999.0` is used to signify that the token is very
+    unlikely.
+    """
+
+
+class ChatCompletionTokenLogprob(OpenAIObject):
+    token: str
+    """The token."""
+
+    bytes: Optional[List[int]] = None
+    """A list of integers representing the UTF-8 bytes representation of the token.
+
+    Useful in instances where characters are represented by multiple tokens and
+    their byte representations must be combined to generate the correct text
+    representation. Can be `null` if there is no bytes representation for the token.
+    """
+
+    logprob: float
+    """The log probability of this token, if it is within the top 20 most likely
+    tokens.
+
+    Otherwise, the value `-9999.0` is used to signify that the token is very
+    unlikely.
+    """
+
+    top_logprobs: List[TopLogprob]
+    """List of the most likely tokens and their log probability, at this token
+    position.
+
+    In rare cases, there may be fewer than the number of requested `top_logprobs`
+    returned.
+    """
+
+
+class ChoiceLogprobs(OpenAIObject):
+    content: Optional[List[ChatCompletionTokenLogprob]] = None
+    """A list of message content tokens with log probability information."""
+
+
 class FunctionCall(OpenAIObject):
    arguments: str
    name: Optional[str] = None
@ -320,19 +378,19 @@ class Message(OpenAIObject):
        super(Message, self).__init__(**params)
        self.content = content
        self.role = role
+        self.tool_calls = None
+        self.function_call = None
+
        if function_call is not None:
            self.function_call = FunctionCall(**function_call)

        if tool_calls is not None:
-            self.tool_calls = []
-            for tool_call in tool_calls:
-                if isinstance(tool_call, dict):
-                    self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
-                else:
-                    self.tool_calls.append(tool_call)
+            self.tool_calls = [
+                ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls
+            ]

        if logprobs is not None:
-            self._logprobs = logprobs
+            self._logprobs = ChoiceLogprobs(**logprobs)

    def get(self, key, default=None):
        # Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -355,12 +413,20 @@ class Message(OpenAIObject):


 class Delta(OpenAIObject):
+    tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None
+
    def __init__(
-        self, content=None, role=None, function_call=None, tool_calls=None, **params
+        self,
+        content=None,
+        role=None,
+        function_call=None,
+        tool_calls=None,
+        **params,
    ):
        super(Delta, self).__init__(**params)
        self.content = content
        self.role = role
+
        if function_call is not None and isinstance(function_call, dict):
            self.function_call = FunctionCall(**function_call)
        else:
@ -410,7 +476,7 @@ class Choices(OpenAIObject):
        )  # set finish_reason for all responses
        self.index = index
        if message is None:
-            self.message = Message(content=None)
+            self.message = Message()
        else:
            if isinstance(message, Message):
                self.message = message
@ -492,7 +558,11 @@ class StreamingChoices(OpenAIObject):
            self.delta = Delta()
        if enhancements is not None:
            self.enhancements = enhancements
-        self.logprobs = logprobs
+
+        if logprobs is not None and isinstance(logprobs, dict):
+            self.logprobs = ChoiceLogprobs(**logprobs)
+        else:
+            self.logprobs = logprobs  # type: ignore

    def __contains__(self, key):
        # Define custom behavior for the 'in' operator
@ -1139,6 +1209,13 @@ class Logging:
            if verbose_logger.level == 0:
                # this means verbose logger was not switched on - user is in litellm.set_verbose=True
                print_verbose(f"\033[92m{curl_command}\033[0m\n")
+
+            if litellm.json_logs:
+                verbose_logger.info(
+                    "POST Request Sent from LiteLLM",
+                    extra={"api_base": {api_base}, **masked_headers},
+                )
+            else:
                verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
            if self.logger_fn and callable(self.logger_fn):
                try:
@ -1149,7 +1226,6 @@ class Logging:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                    )
-
            # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
            callbacks = litellm.input_callback + self.dynamic_input_callbacks
            for callback in callbacks:
@ -1166,29 +1242,20 @@ class Logging:
                            litellm_call_id=self.litellm_params["litellm_call_id"],
                            print_verbose=print_verbose,
                        )
-
-                    elif callback == "lite_debugger":
-                        print_verbose(
-                            f"reaches litedebugger for logging! - model_call_details {self.model_call_details}"
-                        )
-                        model = self.model_call_details["model"]
-                        messages = self.model_call_details["input"]
-                        print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
-                        liteDebuggerClient.input_log_event(
-                            model=model,
-                            messages=messages,
-                            end_user=self.model_call_details.get("user", "default"),
-                            litellm_call_id=self.litellm_params["litellm_call_id"],
-                            litellm_params=self.model_call_details["litellm_params"],
-                            optional_params=self.model_call_details["optional_params"],
-                            print_verbose=print_verbose,
-                            call_type=self.call_type,
-                        )
                    elif callback == "sentry" and add_breadcrumb:
-                        print_verbose("reaches sentry breadcrumbing")
+                        try:
+                            details_to_log = copy.deepcopy(self.model_call_details)
+                        except:
+                            details_to_log = self.model_call_details
+                        if litellm.turn_off_message_logging:
+                            # make a copy of the _model_Call_details and log it
+                            details_to_log.pop("messages", None)
+                            details_to_log.pop("input", None)
+                            details_to_log.pop("prompt", None)
+
                        add_breadcrumb(
                            category="litellm.llm_call",
-                            message=f"Model Call Details pre-call: {self.model_call_details}",
+                            message=f"Model Call Details pre-call: {details_to_log}",
                            level="info",
                        )
                    elif isinstance(callback, CustomLogger):  # custom logger class
@ -1252,7 +1319,7 @@ class Logging:
                    print_verbose(
                        f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
                    )
-
+            self.redact_message_input_output_from_logging(result=original_response)
            # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made

            callbacks = litellm.input_callback + self.dynamic_input_callbacks
@ -1270,9 +1337,19 @@ class Logging:
                        )
                    elif callback == "sentry" and add_breadcrumb:
                        print_verbose("reaches sentry breadcrumbing")
+                        try:
+                            details_to_log = copy.deepcopy(self.model_call_details)
+                        except:
+                            details_to_log = self.model_call_details
+                        if litellm.turn_off_message_logging:
+                            # make a copy of the _model_Call_details and log it
+                            details_to_log.pop("messages", None)
+                            details_to_log.pop("input", None)
+                            details_to_log.pop("prompt", None)
+
                        add_breadcrumb(
                            category="litellm.llm_call",
-                            message=f"Model Call Details post-call: {self.model_call_details}",
+                            message=f"Model Call Details post-call: {details_to_log}",
                            level="info",
                        )
                    elif isinstance(callback, CustomLogger):  # custom logger class
@ -1464,6 +1541,8 @@ class Logging:
            else:
                callbacks = litellm.success_callback

+            self.redact_message_input_output_from_logging(result=result)
+
            for callback in callbacks:
                try:
                    litellm_params = self.model_call_details.get("litellm_params", {})
@ -1850,6 +1929,51 @@ class Logging:
                                end_time=end_time,
                                print_verbose=print_verbose,
                            )
+                    if (
+                        callback == "openmeter"
+                        and self.model_call_details.get("litellm_params", {}).get(
+                            "acompletion", False
+                        )
+                        == False
+                        and self.model_call_details.get("litellm_params", {}).get(
+                            "aembedding", False
+                        )
+                        == False
+                        and self.model_call_details.get("litellm_params", {}).get(
+                            "aimage_generation", False
+                        )
+                        == False
+                        and self.model_call_details.get("litellm_params", {}).get(
+                            "atranscription", False
+                        )
+                        == False
+                    ):
+                        global openMeterLogger
+                        if openMeterLogger is None:
+                            print_verbose("Instantiates openmeter client")
+                            openMeterLogger = OpenMeterLogger()
+                        if self.stream and complete_streaming_response is None:
+                            openMeterLogger.log_stream_event(
+                                kwargs=self.model_call_details,
+                                response_obj=result,
+                                start_time=start_time,
+                                end_time=end_time,
+                            )
+                        else:
+                            if self.stream and complete_streaming_response:
+                                self.model_call_details["complete_response"] = (
+                                    self.model_call_details.get(
+                                        "complete_streaming_response", {}
+                                    )
+                                )
+                                result = self.model_call_details["complete_response"]
+                            openMeterLogger.log_success_event(
+                                kwargs=self.model_call_details,
+                                response_obj=result,
+                                start_time=start_time,
+                                end_time=end_time,
+                            )
+
                    if (
                        isinstance(callback, CustomLogger)
                        and self.model_call_details.get("litellm_params", {}).get(
@ -2008,7 +2132,9 @@ class Logging:
                    callbacks.append(callback)
        else:
            callbacks = litellm._async_success_callback
-        print_verbose(f"Async success callbacks: {callbacks}")
+
+        self.redact_message_input_output_from_logging(result=result)
+
        for callback in callbacks:
            # check if callback can run for this request
            litellm_params = self.model_call_details.get("litellm_params", {})
@ -2046,6 +2172,35 @@ class Logging:
                                await litellm.cache.async_add_cache(result, **kwargs)
                            else:
                                litellm.cache.add_cache(result, **kwargs)
+                if callback == "openmeter":
+                    global openMeterLogger
+                    if self.stream == True:
+                        if (
+                            "async_complete_streaming_response"
+                            in self.model_call_details
+                        ):
+                            await openMeterLogger.async_log_success_event(
+                                kwargs=self.model_call_details,
+                                response_obj=self.model_call_details[
+                                    "async_complete_streaming_response"
+                                ],
+                                start_time=start_time,
+                                end_time=end_time,
+                            )
+                        else:
+                            await openMeterLogger.async_log_stream_event(  # [TODO]: move this to being an async log stream event function
+                                kwargs=self.model_call_details,
+                                response_obj=result,
+                                start_time=start_time,
+                                end_time=end_time,
+                            )
+                    else:
+                        await openMeterLogger.async_log_success_event(
+                            kwargs=self.model_call_details,
+                            response_obj=result,
+                            start_time=start_time,
+                            end_time=end_time,
+                        )
                if isinstance(callback, CustomLogger):  # custom logger class
                    if self.stream == True:
                        if (
@ -2169,7 +2324,10 @@ class Logging:
                start_time=start_time,
                end_time=end_time,
            )
+
            result = None  # result sent to all loggers, init this to None incase it's not created
+
+            self.redact_message_input_output_from_logging(result=result)
            for callback in litellm.failure_callback:
                try:
                    if callback == "lite_debugger":
@ -2354,6 +2512,39 @@ class Logging:
                    f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"
                )

+    def redact_message_input_output_from_logging(self, result):
+        """
+        Removes messages, prompts, input, response from logging. This modifies the data in-place
+        only redacts when litellm.turn_off_message_logging == True
+        """
+        # check if user opted out of logging message/response to callbacks
+        if litellm.turn_off_message_logging == True:
+            # remove messages, prompts, input, response from logging
+            self.model_call_details["messages"] = "redacted-by-litellm"
+            self.model_call_details["prompt"] = ""
+            self.model_call_details["input"] = ""
+
+            # response cleaning
+            # ChatCompletion Responses
+            if self.stream and "complete_streaming_response" in self.model_call_details:
+                _streaming_response = self.model_call_details[
+                    "complete_streaming_response"
+                ]
+                for choice in _streaming_response.choices:
+                    if isinstance(choice, litellm.Choices):
+                        choice.message.content = "redacted-by-litellm"
+                    elif isinstance(choice, litellm.utils.StreamingChoices):
+                        choice.delta.content = "redacted-by-litellm"
+            else:
+                if result is not None:
+                    if isinstance(result, litellm.ModelResponse):
+                        if hasattr(result, "choices") and result.choices is not None:
+                            for choice in result.choices:
+                                if isinstance(choice, litellm.Choices):
+                                    choice.message.content = "redacted-by-litellm"
+                                elif isinstance(choice, litellm.utils.StreamingChoices):
+                                    choice.delta.content = "redacted-by-litellm"
+

 def exception_logging(
    additional_args={},
@ -2436,7 +2627,7 @@ class Rules:
 ####### CLIENT ###################
 # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
 def function_setup(
-    original_function, rules_obj, start_time, *args, **kwargs
+    original_function: str, rules_obj, start_time, *args, **kwargs
 ):  # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
    try:
        global callback_list, add_breadcrumb, user_logger_fn, Logging
@ -2460,10 +2651,12 @@ def function_setup(
            len(litellm.input_callback) > 0
            or len(litellm.success_callback) > 0
            or len(litellm.failure_callback) > 0
-        ) and len(callback_list) == 0:
+        ) and len(
+            callback_list  # type: ignore
+        ) == 0:  # type: ignore
            callback_list = list(
                set(
-                    litellm.input_callback
+                    litellm.input_callback  # type: ignore
                    + litellm.success_callback
                    + litellm.failure_callback
                )
@ -2472,7 +2665,7 @@ def function_setup(
        ## ASYNC CALLBACKS
        if len(litellm.input_callback) > 0:
            removed_async_items = []
-            for index, callback in enumerate(litellm.input_callback):
+            for index, callback in enumerate(litellm.input_callback):  # type: ignore
                if inspect.iscoroutinefunction(callback):
                    litellm._async_input_callback.append(callback)
                    removed_async_items.append(index)
@ -2483,11 +2676,11 @@ def function_setup(

        if len(litellm.success_callback) > 0:
            removed_async_items = []
-            for index, callback in enumerate(litellm.success_callback):
+            for index, callback in enumerate(litellm.success_callback):  # type: ignore
                if inspect.iscoroutinefunction(callback):
                    litellm._async_success_callback.append(callback)
                    removed_async_items.append(index)
-                elif callback == "dynamodb":
+                elif callback == "dynamodb" or callback == "openmeter":
                    # dynamo is an async callback, it's used for the proxy and needs to be async
                    # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
                    litellm._async_success_callback.append(callback)
@ -2499,7 +2692,7 @@ def function_setup(

        if len(litellm.failure_callback) > 0:
            removed_async_items = []
-            for index, callback in enumerate(litellm.failure_callback):
+            for index, callback in enumerate(litellm.failure_callback):  # type: ignore
                if inspect.iscoroutinefunction(callback):
                    litellm._async_failure_callback.append(callback)
                    removed_async_items.append(index)
@ -2533,16 +2726,26 @@ def function_setup(
            dynamic_success_callbacks = kwargs.pop("success_callback")

        if add_breadcrumb:
+            try:
+                details_to_log = copy.deepcopy(kwargs)
+            except:
+                details_to_log = kwargs
+
+            if litellm.turn_off_message_logging:
+                # make a copy of the _model_Call_details and log it
+                details_to_log.pop("messages", None)
+                details_to_log.pop("input", None)
+                details_to_log.pop("prompt", None)
            add_breadcrumb(
                category="litellm.llm_call",
-                message=f"Positional Args: {args}, Keyword Args: {kwargs}",
+                message=f"Positional Args: {args}, Keyword Args: {details_to_log}",
                level="info",
            )
        if "logger_fn" in kwargs:
            user_logger_fn = kwargs["logger_fn"]
        # INIT LOGGER - for user-specified integrations
        model = args[0] if len(args) > 0 else kwargs.get("model", None)
-        call_type = original_function.__name__
+        call_type = original_function
        if (
            call_type == CallTypes.completion.value
            or call_type == CallTypes.acompletion.value
@ -2724,7 +2927,7 @@ def client(original_function):
        try:
            if logging_obj is None:
                logging_obj, kwargs = function_setup(
-                    original_function, rules_obj, start_time, *args, **kwargs
+                    original_function.__name__, rules_obj, start_time, *args, **kwargs
                )
            kwargs["litellm_logging_obj"] = logging_obj

@ -3033,7 +3236,7 @@ def client(original_function):
        try:
            if logging_obj is None:
                logging_obj, kwargs = function_setup(
-                    original_function, rules_obj, start_time, *args, **kwargs
+                    original_function.__name__, rules_obj, start_time, *args, **kwargs
                )
            kwargs["litellm_logging_obj"] = logging_obj

@ -3540,12 +3743,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
    a100_80gb_price_per_second_public = (
        0.001400  # assume all calls sent to A100 80GB for now
    )
-    if total_time == 0.0:
+    if total_time == 0.0:  # total time is in ms
        start_time = completion_response["created"]
        end_time = completion_response["ended"]
        total_time = end_time - start_time

-    return a100_80gb_price_per_second_public * total_time
+    return a100_80gb_price_per_second_public * total_time / 1000


 def _select_tokenizer(model: str):
@ -3567,7 +3770,7 @@ def _select_tokenizer(model: str):
        tokenizer = Tokenizer.from_str(json_str)
        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
    # llama2
-    elif "llama-2" in model.lower():
+    elif "llama-2" in model.lower() or "replicate" in model.lower():
        tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
        return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
    # default - tiktoken
@ -4168,7 +4371,10 @@ def completion_cost(
            model = get_model_params_and_category(model)
        # replicate llms are calculate based on time for request running
        # see https://replicate.com/pricing
-        elif model in litellm.replicate_models or "replicate" in model:
+        elif (
+            model in litellm.replicate_models or "replicate" in model
+        ) and model not in litellm.model_cost:
+            # for unmapped replicate model, default to replicate's time tracking logic
            return get_replicate_completion_pricing(completion_response, total_time)

        (
@ -4554,7 +4760,36 @@ def get_optional_params(
            k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
        ):  # allow dynamically setting vertex ai init logic
            continue
+
        passed_params[k] = v
+
+    optional_params = {}
+
+    common_auth_dict = litellm.common_cloud_provider_auth_params
+    if custom_llm_provider in common_auth_dict["providers"]:
+        """
+        Check if params = ["project", "region_name", "token"]
+        and correctly translate for = ["azure", "vertex_ai", "watsonx", "aws"]
+        """
+        if custom_llm_provider == "azure":
+            optional_params = litellm.AzureOpenAIConfig().map_special_auth_params(
+                non_default_params=passed_params, optional_params=optional_params
+            )
+        elif custom_llm_provider == "bedrock":
+            optional_params = (
+                litellm.AmazonBedrockGlobalConfig().map_special_auth_params(
+                    non_default_params=passed_params, optional_params=optional_params
+                )
+            )
+        elif custom_llm_provider == "vertex_ai":
+            optional_params = litellm.VertexAIConfig().map_special_auth_params(
+                non_default_params=passed_params, optional_params=optional_params
+            )
+        elif custom_llm_provider == "watsonx":
+            optional_params = litellm.IBMWatsonXAIConfig().map_special_auth_params(
+                non_default_params=passed_params, optional_params=optional_params
+            )
+
    default_params = {
        "functions": None,
        "function_call": None,
@ -4590,7 +4825,7 @@ def get_optional_params(
            and v != default_params[k]
        )
    }
-    optional_params = {}
+
    ## raise exception if function calling passed in for a provider that doesn't support it
    if (
        "functions" in non_default_params
@ -5268,7 +5503,8 @@ def get_optional_params(
            optional_params["tools"] = tools
        if tool_choice is not None:
            optional_params["tool_choice"] = tool_choice
-
+        if response_format is not None:
+            optional_params["response_format"] = response_format
        # check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion
        safe_mode = passed_params.pop("safe_mode", None)
        random_seed = passed_params.pop("random_seed", None)
@ -5280,6 +5516,7 @@ def get_optional_params(
        optional_params["extra_body"] = (
            extra_body  # openai client supports `extra_body` param
        )
+
    elif custom_llm_provider == "groq":
        supported_params = get_supported_openai_params(
            model=model, custom_llm_provider=custom_llm_provider
@ -5360,6 +5597,49 @@ def get_optional_params(
        optional_params["extra_body"] = (
            extra_body  # openai client supports `extra_body` param
        )
+    elif custom_llm_provider == "watsonx":
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
+        _check_valid_arg(supported_params=supported_params)
+        if max_tokens is not None:
+            optional_params["max_new_tokens"] = max_tokens
+        if stream:
+            optional_params["stream"] = stream
+        if temperature is not None:
+            optional_params["temperature"] = temperature
+        if top_p is not None:
+            optional_params["top_p"] = top_p
+        if frequency_penalty is not None:
+            optional_params["repetition_penalty"] = frequency_penalty
+        if seed is not None:
+            optional_params["random_seed"] = seed
+        if stop is not None:
+            optional_params["stop_sequences"] = stop
+
+        # WatsonX-only parameters
+        extra_body = {}
+        if "decoding_method" in passed_params:
+            extra_body["decoding_method"] = passed_params.pop("decoding_method")
+        if "min_tokens" in passed_params or "min_new_tokens" in passed_params:
+            extra_body["min_new_tokens"] = passed_params.pop(
+                "min_tokens", passed_params.pop("min_new_tokens")
+            )
+        if "top_k" in passed_params:
+            extra_body["top_k"] = passed_params.pop("top_k")
+        if "truncate_input_tokens" in passed_params:
+            extra_body["truncate_input_tokens"] = passed_params.pop(
+                "truncate_input_tokens"
+            )
+        if "length_penalty" in passed_params:
+            extra_body["length_penalty"] = passed_params.pop("length_penalty")
+        if "time_limit" in passed_params:
+            extra_body["time_limit"] = passed_params.pop("time_limit")
+        if "return_options" in passed_params:
+            extra_body["return_options"] = passed_params.pop("return_options")
+        optional_params["extra_body"] = (
+            extra_body  # openai client supports `extra_body` param
+        )
    else:  # assume passing in params for openai/azure openai
        print_verbose(
            f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}"
@ -5762,6 +6042,8 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
            "frequency_penalty",
            "presence_penalty",
        ]
+    elif custom_llm_provider == "watsonx":
+        return litellm.IBMWatsonXAIConfig().get_supported_openai_params()


 def get_formatted_prompt(
@ -5989,6 +6271,8 @@ def get_llm_provider(
            model in litellm.bedrock_models or model in litellm.bedrock_embedding_models
        ):
            custom_llm_provider = "bedrock"
+        elif model in litellm.watsonx_models:
+            custom_llm_provider = "watsonx"
        # openai embeddings
        elif model in litellm.open_ai_embedding_models:
            custom_llm_provider = "openai"
@ -6453,7 +6737,7 @@ def validate_environment(model: Optional[str] = None) -> dict:
            if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ:
                keys_in_environment = True
            else:
-                missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_PROJECT"])
+                missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
        elif custom_llm_provider == "huggingface":
            if "HUGGINGFACE_API_KEY" in os.environ:
                keys_in_environment = True
@ -6579,11 +6863,11 @@ def validate_environment(model: Optional[str] = None) -> dict:

 def set_callbacks(callback_list, function_id=None):

-    global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger
+    global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger

    try:
        for callback in callback_list:
-            print_verbose(f"callback: {callback}")
+            print_verbose(f"init callback list: {callback}")
            if callback == "sentry":
                try:
                    import sentry_sdk
@ -6646,6 +6930,8 @@ def set_callbacks(callback_list, function_id=None):
                promptLayerLogger = PromptLayerLogger()
            elif callback == "langfuse":
                langFuseLogger = LangFuseLogger()
+            elif callback == "openmeter":
+                openMeterLogger = OpenMeterLogger()
            elif callback == "datadog":
                dataDogLogger = DataDogLogger()
            elif callback == "prometheus":
@ -6982,6 +7268,7 @@ def convert_to_model_response_object(
    end_time=None,
    hidden_params: Optional[dict] = None,
 ):
+    received_args = locals()
    try:
        if response_type == "completion" and (
            model_response_object is None
@ -6993,6 +7280,11 @@ def convert_to_model_response_object(
                # for returning cached responses, we need to yield a generator
                return convert_to_streaming_response(response_object=response_object)
            choice_list = []
+
+            assert response_object["choices"] is not None and isinstance(
+                response_object["choices"], Iterable
+            )
+
            for idx, choice in enumerate(response_object["choices"]):
                message = Message(
                    content=choice["message"].get("content", None),
@ -7036,6 +7328,7 @@ def convert_to_model_response_object(
                model_response_object.model = response_object["model"]

            if start_time is not None and end_time is not None:
+                if isinstance(start_time, type(end_time)):
                    model_response_object._response_ms = (  # type: ignore
                        end_time - start_time
                    ).total_seconds() * 1000
@ -7113,7 +7406,9 @@ def convert_to_model_response_object(
                model_response_object._hidden_params = hidden_params
            return model_response_object
    except Exception as e:
-        raise Exception(f"Invalid response object {traceback.format_exc()}")
+        raise Exception(
+            f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
+        )


 def acreate(*args, **kwargs):  ## Thin client to handle the acreate langchain call
@ -7940,7 +8235,10 @@ def exception_type(
                        llm_provider="vertex_ai",
                        response=original_exception.response,
                    )
-                elif "None Unknown Error." in error_str:
+                elif (
+                    "None Unknown Error." in error_str
+                    or "Content has no parts." in error_str
+                ):
                    exception_mapping_worked = True
                    raise APIError(
                        message=f"VertexAIException - {error_str}",
@ -9393,9 +9691,14 @@ class CustomStreamWrapper:
                    is_finished = True
                    finish_reason = str_line.choices[0].finish_reason
                    if finish_reason == "content_filter":
+                        if hasattr(str_line.choices[0], "content_filter_result"):
                            error_message = json.dumps(
                                str_line.choices[0].content_filter_result
                            )
+                        else:
+                            error_message = "Azure Response={}".format(
+                                str(dict(str_line))
+                            )
                        raise litellm.AzureOpenAIError(
                            status_code=400, message=error_message
                        )
@ -9683,6 +9986,39 @@ class CustomStreamWrapper:
                "finish_reason": finish_reason,
            }

+    def handle_watsonx_stream(self, chunk):
+        try:
+            if isinstance(chunk, dict):
+                parsed_response = chunk
+            elif isinstance(chunk, (str, bytes)):
+                if isinstance(chunk, bytes):
+                    chunk = chunk.decode("utf-8")
+                if "generated_text" in chunk:
+                    response = chunk.replace("data: ", "").strip()
+                    parsed_response = json.loads(response)
+                else:
+                    return {"text": "", "is_finished": False}
+            else:
+                print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
+                raise ValueError(
+                    f"Unable to parse response. Original response: {chunk}"
+                )
+            results = parsed_response.get("results", [])
+            if len(results) > 0:
+                text = results[0].get("generated_text", "")
+                finish_reason = results[0].get("stop_reason")
+                is_finished = finish_reason != "not_finished"
+                return {
+                    "text": text,
+                    "is_finished": is_finished,
+                    "finish_reason": finish_reason,
+                    "prompt_tokens": results[0].get("input_token_count", None),
+                    "completion_tokens": results[0].get("generated_token_count", None),
+                }
+            return {"text": "", "is_finished": False}
+        except Exception as e:
+            raise e
+
    def model_response_creator(self):
        model_response = ModelResponse(stream=True, model=self.model)
        if self.response_id is not None:
@ -9938,6 +10274,11 @@ class CustomStreamWrapper:
                print_verbose(f"completion obj content: {completion_obj['content']}")
                if response_obj["is_finished"]:
                    self.received_finish_reason = response_obj["finish_reason"]
+            elif self.custom_llm_provider == "watsonx":
+                response_obj = self.handle_watsonx_stream(chunk)
+                completion_obj["content"] = response_obj["text"]
+                if response_obj["is_finished"]:
+                    self.received_finish_reason = response_obj["finish_reason"]
            elif self.custom_llm_provider == "text-completion-openai":
                response_obj = self.handle_openai_text_completion_chunk(chunk)
                completion_obj["content"] = response_obj["text"]
@ -10123,12 +10464,23 @@ class CustomStreamWrapper:
                        model_response.id = original_chunk.id
                        self.response_id = original_chunk.id
                        if len(original_chunk.choices) > 0:
+                            choices = []
+                            for idx, choice in enumerate(original_chunk.choices):
                                try:
-                                delta = dict(original_chunk.choices[0].delta)
-                                print_verbose(f"original delta: {delta}")
-                                model_response.choices[0].delta = Delta(**delta)
+                                    if isinstance(choice, BaseModel):
+                                        try:
+                                            choice_json = choice.model_dump()
                                        except Exception as e:
-                                model_response.choices[0].delta = Delta()
+                                            choice_json = choice.dict()
+                                        choice_json.pop(
+                                            "finish_reason", None
+                                        )  # for mistral etc. which return a value in their last chunk (not-openai compatible).
+                                        print_verbose(f"choice_json: {choice_json}")
+                                        choices.append(StreamingChoices(**choice_json))
+                                except Exception as e:
+                                    choices.append(StreamingChoices())
+                            print_verbose(f"choices in streaming: {choices}")
+                            model_response.choices = choices
                        else:
                            return
                        model_response.system_fingerprint = (
@ -10173,11 +10525,11 @@ class CustomStreamWrapper:
                        )
                    self.holding_chunk = ""
                # if delta is None
-                is_delta_empty = self.is_delta_empty(
+                _is_delta_empty = self.is_delta_empty(
                    delta=model_response.choices[0].delta
                )

-                if is_delta_empty:
+                if _is_delta_empty:
                    # get any function call arguments
                    model_response.choices[0].finish_reason = map_finish_reason(
                        finish_reason=self.received_finish_reason
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@ -1418,6 +1418,123 @@
        "litellm_provider": "replicate",
        "mode": "chat"
    },
+    "replicate/meta/llama-2-13b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-13b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0000005,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-70b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-2-7b-chat": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-70b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000065,
+        "output_cost_per_token": 0.00000275,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/meta/llama-3-8b-instruct": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mistral-7b-instruct-v0.2": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.00000005,
+        "output_cost_per_token": 0.00000025,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
+    "replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
+        "max_tokens": 4096,
+        "max_input_tokens": 4096,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.000001,
+        "litellm_provider": "replicate",
+        "mode": "chat"
+    },
    "openrouter/openai/gpt-3.5-turbo": {
        "max_tokens": 4095,
        "input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
        "litellm_provider": "openrouter",
        "mode": "chat"
    },
+    "openrouter/anthropic/claude-3-opus": {
+        "max_tokens": 4096,
+        "max_input_tokens": 200000,
+        "max_output_tokens": 4096,
+        "input_cost_per_token": 0.000015,
+        "output_cost_per_token": 0.000075,
+        "litellm_provider": "openrouter",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "tool_use_system_prompt_tokens": 395
+    },
    "openrouter/google/palm-2-chat-bison": {
        "max_tokens": 8000,
        "input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
+    "meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192, 
+        "max_input_tokens": 8192, 
+        "max_output_tokens": 8192, 
+        "input_cost_per_token": 0.0000004,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192, 
+        "max_input_tokens": 8192, 
+        "max_output_tokens": 8192, 
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
    "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
        "max_tokens": 77, 
        "max_input_tokens": 77, 
--- a/proxy_server_config.yaml
+++ b/proxy_server_config.yaml
@ -61,14 +61,14 @@ model_list:
      api_key: my-fake-key
      api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
      stream_timeout: 0.001
-      rpm: 10
+      rpm: 100
  - model_name: fake-openai-endpoint-3
    litellm_params:
      model: openai/my-fake-model-2
      api_key: my-fake-key
      api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
      stream_timeout: 0.001
-      rpm: 10
+      rpm: 100
  - model_name: "*"
    litellm_params:
      model: openai/*
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.35.27"
+version = "1.35.36"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.35.27"
+version = "1.35.36"
 version_files = [
    "pyproject.toml:^version"
 ]
--- a/Show more
+++ b/Show more
				`@ -0,0 +1 @@`
				`(self.webpackChunk_N_E=self.webpackChunk_N_E\|\|[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);`
				`@ -1 +0,0 @@`
				`(self.webpackChunk_N_E=self.webpackChunk_N_E\|\|[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);`