Merge branch 'main' into main

Author: Lucca Zenóbio, 2024-05-02 09:46:34 -03:00, committed by GitHub
Commit 78303b79ee
124 changed files with 6716 additions and 1078 deletions


@ -40,7 +40,7 @@ jobs:
pip install "aioboto3==12.3.0" pip install "aioboto3==12.3.0"
pip install langchain pip install langchain
pip install lunary==0.2.5 pip install lunary==0.2.5
pip install "langfuse==2.7.3" pip install "langfuse==2.27.1"
pip install numpydoc pip install numpydoc
pip install traceloop-sdk==0.0.69 pip install traceloop-sdk==0.0.69
pip install openai pip install openai

.gitignore (1 changed line)

@ -51,3 +51,4 @@ loadtest_kub.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
+litellm/proxy/_super_secret_config.yaml


@ -7,7 +7,7 @@ repos:
  rev: 7.0.0 # The version of flake8 to use
  hooks:
  - id: flake8
-    exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/integrations/|^litellm/proxy/tests/
+    exclude: ^litellm/tests/|^litellm/proxy/proxy_cli.py|^litellm/proxy/tests/
    additional_dependencies: [flake8-print]
    files: litellm/.*\.py
- repo: local


@ -227,6 +227,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
+| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
| [xinference [Xorbits Inference]](https://docs.litellm.ai/docs/providers/xinference) | | | | | ✅ |

cookbook/liteLLM_IBM_Watsonx.ipynb (new file, 300 changed lines)

File diff suppressed because one or more lines are too long


@ -23,6 +23,14 @@ response = completion(model="gpt-3.5-turbo", messages=messages)
response = completion("command-nightly", messages) response = completion("command-nightly", messages)
``` ```
## JSON Logs
If you need to store the logs as JSON, just set `litellm.json_logs = True`.
We currently just log the raw POST request from litellm as JSON - [**See Code**].
[Share feedback here](https://github.com/BerriAI/litellm/issues)
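Below is a minimal sketch of how the flag could be combined with the verbose debug logging covered on this page (the model name and message are placeholders):

```python
import litellm
from litellm import completion

litellm.set_verbose = True  # turn on litellm's debug logging
litellm.json_logs = True    # log the raw POST request as JSON instead of plain text

response = completion(
    model="gpt-3.5-turbo",  # placeholder model
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```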
## Logger Function
But sometimes all you care about is seeing exactly what's getting sent to your api call and what's being returned - e.g. if the api call is failing, why is that happening? what are the exact params being set?


@ -213,3 +213,349 @@ asyncio.run(loadtest_fn())
```
## Multi-Instance TPM/RPM Load Test (Router)
Test if your defined tpm/rpm limits are respected across multiple instances of the Router object.
In our test:
- Max RPM per deployment = 100 requests per minute
- Max Throughput / min on router = 200 requests per minute (2 deployments)
- Load we'll send through router = 600 requests per minute
So we expect only ~200 of the 600 requests per minute to succeed.
:::info
If you don't want to call a real LLM API endpoint, you can set up a fake openai server. [See code](#extra---setup-fake-openai-server)
:::
### Code
Let's hit the router with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_router.py` AND run it with `python3 test_loadtest_router.py`
```python
from litellm import Router
import litellm
litellm.suppress_debug_info = True
litellm.set_verbose = False
import logging
logging.basicConfig(level=logging.CRITICAL)
import os, random, uuid, time, asyncio
# Model list for OpenAI and Anthropic models
model_list = [
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8080",
"rpm": 100
},
},
{
"model_name": "fake-openai-endpoint",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "my-fake-key",
"api_base": "http://0.0.0.0:8081",
"rpm": 100
},
},
]
router_1 = Router(model_list=model_list, num_retries=0, enable_pre_call_checks=True, routing_strategy="usage-based-routing-v2", redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
router_2 = Router(model_list=model_list, num_retries=0, routing_strategy="usage-based-routing-v2", enable_pre_call_checks=True, redis_host=os.getenv("REDIS_HOST"), redis_port=os.getenv("REDIS_PORT"), redis_password=os.getenv("REDIS_PASSWORD"))
async def router_completion_non_streaming():
try:
client: Router = random.sample([router_1, router_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.acompletion(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [router_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
## Multi-Instance TPM/RPM Load Test (Proxy)
Test if your defined tpm/rpm limits are respected across multiple instances.
The quickest way to do this is by testing the [proxy](./proxy/quick_start.md). The proxy uses the [router](./routing.md) under the hood, so if you're using either of them, this test should work for you.
In our test:
- Max RPM per deployment = 100 requests per minute
- Max Throughput / min on proxy = 200 requests per minute (2 deployments)
- Load we'll send to proxy = 600 requests per minute
So we'll send 600 requests per minute, but expect only 200 requests per minute to succeed.
:::info
If you don't want to call a real LLM API endpoint, you can set up a fake openai server. [See code](#extra---setup-fake-openai-server)
:::
### 1. Setup config
```yaml
model_list:
- litellm_params:
api_base: http://0.0.0.0:8080
api_key: my-fake-key
model: openai/my-fake-model
rpm: 100
model_name: fake-openai-endpoint
- litellm_params:
api_base: http://0.0.0.0:8081
api_key: my-fake-key
model: openai/my-fake-model-2
rpm: 100
model_name: fake-openai-endpoint
router_settings:
num_retries: 0
enable_pre_call_checks: true
redis_host: os.environ/REDIS_HOST ## 👈 IMPORTANT! Setup the proxy w/ redis
redis_password: os.environ/REDIS_PASSWORD
redis_port: os.environ/REDIS_PORT
routing_strategy: usage-based-routing-v2
```
### 2. Start proxy 2 instances
**Instance 1**
```bash
litellm --config /path/to/config.yaml --port 4000
## RUNNING on http://0.0.0.0:4000
```
**Instance 2**
```bash
litellm --config /path/to/config.yaml --port 4001
## RUNNING on http://0.0.0.0:4001
```
### 3. Run Test
Let's hit the proxy with 600 requests per minute.
Copy this script 👇. Save it as `test_loadtest_proxy.py` AND run it with `python3 test_loadtest_proxy.py`
```python
from openai import AsyncOpenAI, AsyncAzureOpenAI
import random, uuid
import time, asyncio, litellm
# import logging
# logging.basicConfig(level=logging.DEBUG)
#### LITELLM PROXY ####
litellm_client = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4000"
)
litellm_client_2 = AsyncOpenAI(
api_key="sk-1234", # [CHANGE THIS]
base_url="http://0.0.0.0:4001"
)
async def proxy_completion_non_streaming():
try:
client = random.sample([litellm_client, litellm_client_2], 1)[0] # randomly pick b/w clients
# print(f"client={client}")
response = await client.chat.completions.create(
model="fake-openai-endpoint", # [CHANGE THIS] (if you call it something else on your proxy)
messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
)
return response
except Exception as e:
# print(e)
return None
async def loadtest_fn():
start = time.time()
n = 600 # Number of concurrent tasks
tasks = [proxy_completion_non_streaming() for _ in range(n)]
chat_completions = await asyncio.gather(*tasks)
successful_completions = [c for c in chat_completions if c is not None]
print(n, time.time() - start, len(successful_completions))
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
# Run the event loop to execute the async function
async def parent_fn():
for _ in range(10):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
print(f"triggered new batch - {current_minute}")
await loadtest_fn()
await asyncio.sleep(10)
asyncio.run(parent_fn())
```
### Extra - Setup Fake OpenAI Server
Let's set up a fake openai server with an RPM limit of 100.
Let's call our file `fake_openai_server.py`.
```
# import sys, os
# sys.path.insert(
# 0, os.path.abspath("../")
# ) # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from fastapi import FastAPI, Request, HTTPException, UploadFile, File
import httpx, os, json
from openai import AsyncOpenAI
from typing import Optional
from slowapi import Limiter
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import PlainTextResponse
class ProxyException(Exception):
# NOTE: DO NOT MODIFY THIS
# This is used to map exactly to OPENAI Exceptions
def __init__(
self,
message: str,
type: str,
param: Optional[str],
code: Optional[int],
):
self.message = message
self.type = type
self.param = param
self.code = code
def to_dict(self) -> dict:
"""Converts the ProxyException instance to a dictionary."""
return {
"message": self.message,
"type": self.type,
"param": self.param,
"code": self.code,
}
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
@app.exception_handler(RateLimitExceeded)
async def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
return JSONResponse(status_code=429,
content={"detail": "Rate Limited!"})
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
@limiter.limit("100/minute")
async def completion(request: Request):
# raise HTTPException(status_code=429, detail="Rate Limited!")
return {
"id": "chatcmpl-123",
"object": "chat.completion",
"created": 1677652288,
"model": None,
"system_fingerprint": "fp_44709d6fcb",
"choices": [{
"index": 0,
"message": {
"role": "assistant",
"content": "\n\nHello there, how may I assist you today?",
},
"logprobs": None,
"finish_reason": "stop"
}],
"usage": {
"prompt_tokens": 9,
"completion_tokens": 12,
"total_tokens": 21
}
}
if __name__ == "__main__":
import socket
import uvicorn
port = 8080
while True:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
result = sock.connect_ex(('0.0.0.0', port))
if result != 0:
print(f"Port {port} is available, starting server...")
break
else:
port += 1
uvicorn.run(app, host="0.0.0.0", port=port)
```
```bash
python3 fake_openai_server.py
```


@ -331,49 +331,25 @@ response = litellm.completion(model="gpt-3.5-turbo", messages=messages, metadata
## Examples
### Custom Callback to track costs for Streaming + Non-Streaming
+By default, the response cost is accessible in the logging object via `kwargs["response_cost"]` on success (sync + async)
```python
+# Step 1. Write your custom callback function
def track_cost_callback(
    kwargs,                 # kwargs to completion
    completion_response,    # response from completion
    start_time, end_time    # start/end time
):
    try:
-        # init logging config
-        logging.basicConfig(
-                filename='cost.log',
-                level=logging.INFO,
-                format='%(asctime)s - %(message)s',
-                datefmt='%Y-%m-%d %H:%M:%S'
-        )
-        # check if it has collected an entire stream response
-        if "complete_streaming_response" in kwargs:
-            # for tracking streaming cost we pass the "messages" and the output_text to litellm.completion_cost
-            completion_response=kwargs["complete_streaming_response"]
-            input_text = kwargs["messages"]
-            output_text = completion_response["choices"][0]["message"]["content"]
-            response_cost = litellm.completion_cost(
-                model = kwargs["model"],
-                messages = input_text,
-                completion=output_text
-            )
-            print("streaming response_cost", response_cost)
-            logging.info(f"Model {kwargs['model']} Cost: ${response_cost:.8f}")
-        # for non streaming responses
-        else:
-            # we pass the completion_response obj
-            if kwargs["stream"] != True:
-                response_cost = litellm.completion_cost(completion_response=completion_response)
-                print("regular response_cost", response_cost)
-                logging.info(f"Model {completion_response.model} Cost: ${response_cost:.8f}")
+        response_cost = kwargs["response_cost"] # litellm calculates response cost for you
+        print("regular response_cost", response_cost)
    except:
        pass
-# Assign the custom callback function
+# Step 2. Assign the custom callback function
litellm.success_callback = [track_cost_callback]
+# Step 3. Make litellm.completion call
response = completion(
    model="gpt-3.5-turbo",
    messages=[


@ -121,10 +121,12 @@ response = completion(
    metadata={
        "generation_name": "ishaan-test-generation", # set langfuse Generation Name
        "generation_id": "gen-id22", # set langfuse Generation ID
-        "trace_id": "trace-id22", # set langfuse Trace ID
        "trace_user_id": "user-id2", # set langfuse Trace User ID
        "session_id": "session-1", # set langfuse Session ID
        "tags": ["tag1", "tag2"] # set langfuse Tags
+        "trace_id": "trace-id22", # set langfuse Trace ID
+        ### OR ###
+        "existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
    },
)
@ -167,6 +169,9 @@ messages = [
chat(messages)
```
## Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
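As a minimal sketch, assuming the Langfuse callback is configured as shown earlier on this page (keys and model are placeholders):

```python
import os
import litellm
from litellm import completion

os.environ["LANGFUSE_PUBLIC_KEY"] = ""  # placeholder
os.environ["LANGFUSE_SECRET_KEY"] = ""  # placeholder

litellm.success_callback = ["langfuse"]
litellm.turn_off_message_logging = True  # redact message/response content; request metadata is still logged

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
)
```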
## Troubleshooting & Errors
### Data not getting logged to Langfuse ?


@ -0,0 +1,97 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# OpenMeter - Usage-Based Billing
[OpenMeter](https://openmeter.io/) is an Open Source Usage-Based Billing solution for AI/Cloud applications. It integrates with Stripe for easy billing.
<Image img={require('../../img/openmeter.png')} />
:::info
We want to learn how we can make the callbacks better! Meet the LiteLLM [founders](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version) or
join our [discord](https://discord.gg/wuPM9dRgDw)
:::
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with OpenMeter
Get your OpenMeter API Key from https://openmeter.cloud/meters
```python
litellm.success_callback = ["openmeter"] # logs cost + usage of successful calls to openmeter
```
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install litellm
import litellm
import os
# from https://openmeter.cloud
os.environ["OPENMETER_API_ENDPOINT"] = ""
os.environ["OPENMETER_API_KEY"] = ""
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set openmeter as a callback, litellm will send the data to openmeter
litellm.success_callback = ["openmeter"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
</Tabs>
<Image img={require('../../img/openmeter_img_2.png')} />


@ -40,5 +40,9 @@ response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content
print(response)
```
## Redacting Messages, Response Content from Sentry Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to sentry, but request metadata will still be logged.
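A minimal sketch, assuming Sentry is configured via `SENTRY_DSN` and registered as a callback as in the snippet above (the DSN and model are placeholders):

```python
import os
import litellm
from litellm import completion

os.environ["SENTRY_DSN"] = "your-sentry-dsn"  # placeholder

litellm.failure_callback = ["sentry"]    # assumption: Sentry is wired up as a failure callback here
litellm.turn_off_message_logging = True  # redact message/response content from what is sent to Sentry

response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
```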
[Let us know](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+) if you need any additional options from Sentry.


@ -53,6 +53,50 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported.
| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
## Function Calling
```python
from litellm import completion
import os
# set env
os.environ["MISTRAL_API_KEY"] = "your-api-key"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [{"role": "user", "content": "What's the weather like in Boston today?"}]
response = completion(
model="mistral/mistral-large-latest",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
```
## Sample Usage - Embedding
```python
from litellm import embedding


@ -4,6 +4,13 @@ LiteLLM supports all models on VLLM.
🚀[Code Tutorial](https://github.com/BerriAI/litellm/blob/main/cookbook/VLLM_Model_Testing.ipynb)
:::info
To call a HOSTED VLLM Endpoint use [these docs](./openai_compatible.md)
:::
### Quick Start
```
pip install litellm vllm


@ -0,0 +1,284 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# IBM watsonx.ai
LiteLLM supports all IBM [watsonx.ai](https://watsonx.ai/) foundational models and embeddings.
## Environment Variables
```python
os.environ["WATSONX_URL"] = "" # (required) Base URL of your WatsonX instance
# (required) either one of the following:
os.environ["WATSONX_APIKEY"] = "" # IBM cloud API key
os.environ["WATSONX_TOKEN"] = "" # IAM auth token
# optional - can also be passed as params to completion() or embedding()
os.environ["WATSONX_PROJECT_ID"] = "" # Project ID of your WatsonX instance
os.environ["WATSONX_DEPLOYMENT_SPACE_ID"] = "" # ID of your deployment space to use deployed models
```
See [here](https://cloud.ibm.com/apidocs/watsonx-ai#api-authentication) for more information on how to get an access token to authenticate to watsonx.ai.
## Usage
<a target="_blank" href="https://colab.research.google.com/github/BerriAI/litellm/blob/main/cookbook/liteLLM_IBM_Watsonx.ipynb">
<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>" # or pass with os.environ["WATSONX_PROJECT_ID"]
)
response = completion(
model="watsonx/meta-llama/llama-3-8b-instruct",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
project_id="<my-project-id>"
)
```
## Usage - Streaming
```python
import os
from litellm import completion
os.environ["WATSONX_URL"] = ""
os.environ["WATSONX_APIKEY"] = ""
os.environ["WATSONX_PROJECT_ID"] = ""
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "what is your favorite colour?","role": "user"}],
stream=True
)
for chunk in response:
print(chunk)
```
#### Example Streaming Output Chunk
```json
{
"choices": [
{
"finish_reason": null,
"index": 0,
"delta": {
"content": "I don't have a favorite color, but I do like the color blue. What's your favorite color?"
}
}
],
"created": null,
"model": "watsonx/ibm/granite-13b-chat-v2",
"usage": {
"prompt_tokens": null,
"completion_tokens": null,
"total_tokens": null
}
}
```
## Usage - Models in deployment spaces
Models that have been deployed to a deployment space (e.g.: tuned models) can be called using the `deployment/<deployment_id>` format (where `<deployment_id>` is the ID of the deployed model in your deployment space).
The ID of your deployment space must also be set in the environment variable `WATSONX_DEPLOYMENT_SPACE_ID` or passed to the function as `space_id=<deployment_space_id>`.
```python
import litellm
response = litellm.completion(
model="watsonx/deployment/<deployment_id>",
messages=[{"content": "Hello, how are you?", "role": "user"}],
space_id="<deployment_space_id>"
)
```
## Usage - Embeddings
LiteLLM also supports making requests to IBM watsonx.ai embedding models. The credential needed for this is the same as for completion.
```python
from litellm import embedding
response = embedding(
model="watsonx/ibm/slate-30m-english-rtrvr",
input=["What is the capital of France?"],
project_id="<my-project-id>"
)
print(response)
# EmbeddingResponse(model='ibm/slate-30m-english-rtrvr', data=[{'object': 'embedding', 'index': 0, 'embedding': [-0.037463713, -0.02141933, -0.02851813, 0.015519324, ..., -0.0021367231, -0.01704561, -0.001425816, 0.0035238306]}], object='list', usage=Usage(prompt_tokens=8, total_tokens=8))
```
## OpenAI Proxy Usage
Here's how to call IBM watsonx.ai with the LiteLLM Proxy Server
### 1. Save keys in your environment
```bash
export WATSONX_URL=""
export WATSONX_APIKEY=""
export WATSONX_PROJECT_ID=""
```
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="CLI">
```bash
$ litellm --model watsonx/meta-llama/llama-3-8b-instruct
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
model_list:
- model_name: llama-3-8b
litellm_params:
# all params accepted by litellm.completion()
model: watsonx/meta-llama/llama-3-8b-instruct
api_key: "os.environ/WATSONX_API_KEY" # does os.getenv("WATSONX_API_KEY")
```
</TabItem>
</Tabs>
### 3. Test it
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "llama-3-8b",
"messages": [
{
"role": "user",
"content": "what is your favorite colour?"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="llama-3-8b", messages=[
{
"role": "user",
"content": "what is your favorite colour?"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "llama-3-8b",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Authentication
### Passing credentials as parameters
You can also pass the credentials as parameters to the completion and embedding functions.
```python
import os
from litellm import completion
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=[{ "content": "What is your favorite color?","role": "user"}],
url="",
api_key="",
project_id=""
)
```
## Supported IBM watsonx.ai Models
Here are some examples of models available in IBM watsonx.ai that you can use with LiteLLM:
| Model Name | Command |
| ---------- | --------- |
| Flan T5 XXL | `completion(model=watsonx/google/flan-t5-xxl, messages=messages)` |
| Flan Ul2 | `completion(model=watsonx/google/flan-ul2, messages=messages)` |
| Mt0 XXL | `completion(model=watsonx/bigscience/mt0-xxl, messages=messages)` |
| Gpt Neox | `completion(model=watsonx/eleutherai/gpt-neox-20b, messages=messages)` |
| Mpt 7B Instruct2 | `completion(model=watsonx/ibm/mpt-7b-instruct2, messages=messages)` |
| Starcoder | `completion(model=watsonx/bigcode/starcoder, messages=messages)` |
| Llama 2 70B Chat | `completion(model=watsonx/meta-llama/llama-2-70b-chat, messages=messages)` |
| Llama 2 13B Chat | `completion(model=watsonx/meta-llama/llama-2-13b-chat, messages=messages)` |
| Granite 13B Instruct | `completion(model=watsonx/ibm/granite-13b-instruct-v1, messages=messages)` |
| Granite 13B Chat | `completion(model=watsonx/ibm/granite-13b-chat-v1, messages=messages)` |
| Flan T5 XL | `completion(model=watsonx/google/flan-t5-xl, messages=messages)` |
| Granite 13B Chat V2 | `completion(model=watsonx/ibm/granite-13b-chat-v2, messages=messages)` |
| Granite 13B Instruct V2 | `completion(model=watsonx/ibm/granite-13b-instruct-v2, messages=messages)` |
| Elyza Japanese Llama 2 7B Instruct | `completion(model=watsonx/elyza/elyza-japanese-llama-2-7b-instruct, messages=messages)` |
| Mixtral 8X7B Instruct V01 Q | `completion(model=watsonx/ibm-mistralai/mixtral-8x7b-instruct-v01-q, messages=messages)` |
For a list of all available models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx&locale=en&audience=wdp).
## Supported IBM watsonx.ai Embedding Models
| Model Name | Function Call |
|----------------------|---------------------------------------------|
| Slate 30m | `embedding(model="watsonx/ibm/slate-30m-english-rtrvr", input=input)` |
| Slate 125m | `embedding(model="watsonx/ibm/slate-125m-english-rtrvr", input=input)` |
For a list of all available embedding models in watsonx.ai, see [here](https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models-embed.html?context=wx).


@ -1,13 +1,13 @@
-# Slack Alerting
+# 🚨 Alerting
Get alerts for:
-- hanging LLM api calls
-- failed LLM api calls
-- slow LLM api calls
-- budget Tracking per key/user:
+- Hanging LLM api calls
+- Failed LLM api calls
+- Slow LLM api calls
+- Budget Tracking per key/user:
  - When a User/Key crosses their Budget
  - When a User/Key is 15% away from crossing their Budget
-- failed db read/writes
+- Failed db read/writes
## Quick Start


@ -62,9 +62,11 @@ model_list:
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
  drop_params: True
+  success_callback: ["langfuse"] # OPTIONAL - if you want to start sending LLM Logs to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your env
general_settings:
  master_key: sk-1234 # [OPTIONAL] Only use this if you want to require all calls to contain this key (Authorization: Bearer sk-1234)
+  alerting: ["slack"] # [OPTIONAL] If you want Slack Alerts for Hanging LLM requests, Slow llm responses, Budget Alerts. Make sure to set `SLACK_WEBHOOK_URL` in your env
```
:::info


@ -11,40 +11,37 @@ You can find the Dockerfile to build litellm proxy [here](https://github.com/Ber
<TabItem value="basic" label="Basic"> <TabItem value="basic" label="Basic">
**Step 1. Create a file called `litellm_config.yaml`** ### Step 1. CREATE config.yaml
Example `litellm_config.yaml` (the `os.environ/` prefix means litellm will read `AZURE_API_BASE` from the env) Example `litellm_config.yaml`
```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
```
**Step 2. Run litellm docker image** ```yaml
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-azure-model-deployment>
api_base: os.environ/AZURE_API_BASE # runs os.getenv("AZURE_API_BASE")
api_key: os.environ/AZURE_API_KEY # runs os.getenv("AZURE_API_KEY")
api_version: "2023-07-01-preview"
```
See the latest available ghcr docker image here:
https://github.com/berriai/litellm/pkgs/container/litellm
Your litellm config.yaml should be called `litellm_config.yaml` in the directory you run this command.
The `-v` command will mount that file
Pass `AZURE_API_KEY` and `AZURE_API_BASE` since we set them in step 1 ### Step 2. RUN Docker Image
```shell ```shell
docker run \ docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \ -v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e AZURE_API_KEY=d6*********** \ -e AZURE_API_KEY=d6*********** \
-e AZURE_API_BASE=https://openai-***********/ \ -e AZURE_API_BASE=https://openai-***********/ \
-p 4000:4000 \ -p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \ ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug --config /app/config.yaml --detailed_debug
``` ```
**Step 3. Send a Test Request** Get Latest Image 👉 [here](https://github.com/berriai/litellm/pkgs/container/litellm)
### Step 3. TEST Request
Pass `model=azure-gpt-3.5` this was set on step 1 Pass `model=azure-gpt-3.5` this was set on step 1
@ -231,13 +228,16 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
| Docs | When to Use |
| --- | --- |
| [Quick Start](#quick-start) | call 100+ LLMs + Load Balancing |
-| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend |
+| [Deploy with Database](#deploy-with-database) | + use Virtual Keys + Track Spend (Note: When deploying with a database, `DATABASE_URL` and `LITELLM_MASTER_KEY` are required in your env) |
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
## Deploy with Database
### Docker, Kubernetes, Helm Chart
+Requirements:
+- Need a postgres database (e.g. [Supabase](https://supabase.com/), [Neon](https://neon.tech/), etc). Set `DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>` in your env
+- Set a `LITELLM_MASTER_KEY`, this is your Proxy Admin key - you can use this to create other keys (🚨 must start with `sk-`)
<Tabs>
@ -252,6 +252,8 @@ docker pull ghcr.io/berriai/litellm-database:main-latest
```shell
docker run \
    -v $(pwd)/litellm_config.yaml:/app/config.yaml \
+    -e LITELLM_MASTER_KEY=sk-1234 \
+    -e DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname> \
    -e AZURE_API_KEY=d6*********** \
    -e AZURE_API_BASE=https://openai-***********/ \
    -p 4000:4000 \
@ -267,26 +269,63 @@ Your OpenAI proxy server is now running on `http://0.0.0.0:4000`.
#### Step 1. Create deployment.yaml
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-deployment
spec:
-  replicas: 1
+  replicas: 3
  selector:
    matchLabels:
      app: litellm
  template:
    metadata:
      labels:
        app: litellm
    spec:
      containers:
        - name: litellm-container
-          image: ghcr.io/berriai/litellm-database:main-latest
-          env:
-            - name: DATABASE_URL
-              value: postgresql://<user>:<password>@<host>:<port>/<dbname>
+          image: ghcr.io/berriai/litellm:main-latest
+          imagePullPolicy: Always
+          env:
+            - name: AZURE_API_KEY
+              value: "d6******"
+            - name: AZURE_API_BASE
+              value: "https://ope******"
+            - name: LITELLM_MASTER_KEY
+              value: "sk-1234"
+            - name: DATABASE_URL
+              value: "po**********"
+          args:
+            - "--config"
+            - "/app/proxy_config.yaml" # Update the path to mount the config file
+          volumeMounts: # Define volume mount for proxy_config.yaml
+            - name: config-volume
+              mountPath: /app
+              readOnly: true
+          livenessProbe:
+            httpGet:
+              path: /health/liveliness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /health/readiness
+              port: 4000
+            initialDelaySeconds: 120
+            periodSeconds: 15
+            successThreshold: 1
+            failureThreshold: 3
+            timeoutSeconds: 10
+      volumes: # Define volume to mount proxy_config.yaml
+        - name: config-volume
+          configMap:
+            name: litellm-config
```
```bash ```bash


@ -10,6 +10,7 @@ Log Proxy Input, Output, Exceptions using Custom Callbacks, Langfuse, OpenTeleme
- [Async Custom Callbacks](#custom-callback-class-async)
- [Async Custom Callback APIs](#custom-callback-apis-async)
- [Logging to Langfuse](#logging-proxy-inputoutput---langfuse)
+- [Logging to OpenMeter](#logging-proxy-cost--usage---openmeter)
- [Logging to s3 Buckets](#logging-proxy-inputoutput---s3-buckets)
- [Logging to DataDog](#logging-proxy-inputoutput---datadog)
- [Logging to DynamoDB](#logging-proxy-inputoutput---dynamodb)
@ -401,7 +402,7 @@ litellm_settings:
Start the LiteLLM Proxy and make a test request to verify the logs reached your callback API
## Logging Proxy Input/Output - Langfuse
-We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse
+We will use the `--config` to set `litellm.success_callback = ["langfuse"]` this will log all successful LLM calls to langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment
**Step 1** Install langfuse
@ -419,7 +420,13 @@ litellm_settings:
success_callback: ["langfuse"] success_callback: ["langfuse"]
``` ```
**Step 3**: Start the proxy, make a test request **Step 3**: Set required env variables for logging to langfuse
```shell
export LANGFUSE_PUBLIC_KEY="pk_kk"
export LANGFUSE_SECRET_KEY="sk_ss
```
**Step 4**: Start the proxy, make a test request
Start proxy Start proxy
```shell ```shell
@ -569,6 +576,75 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
All requests made with these keys will log data to their team-specific logging.
### Redacting Messages, Response Content from Langfuse Logging
Set `litellm.turn_off_message_logging=True`. This will prevent the messages and responses from being logged to langfuse, but request metadata will still be logged.
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["langfuse"]
turn_off_message_logging: True
```
## Logging Proxy Cost + Usage - OpenMeter
Bill customers according to their LLM API usage with [OpenMeter](../observability/openmeter.md)
**Required Env Variables**
```bash
# from https://openmeter.cloud
export OPENMETER_API_ENDPOINT="" # defaults to https://openmeter.cloud
export OPENMETER_API_KEY=""
```
### Quick Start
1. Add to Config.yaml
```yaml
model_list:
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key
model: openai/my-fake-model
model_name: fake-openai-endpoint
litellm_settings:
success_callback: ["openmeter"] # 👈 KEY CHANGE
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
<Image img={require('../../img/openmeter_img_2.png')} />
## Logging Proxy Input/Output - DataDog
We will use the `--config` to set `litellm.success_callback = ["datadog"]` this will log all successful LLM calls to DataDog


@ -95,7 +95,7 @@ print(response)
- `router.image_generation()` - completion calls in OpenAI `/v1/images/generations` endpoint format
- `router.aimage_generation()` - async image generation calls
-### Advanced - Routing Strategies
+## Advanced - Routing Strategies
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
Router provides 4 strategies for routing your calls across multiple deployments:
@ -278,6 +278,36 @@ router_settings:
routing_strategy_args: {"ttl": 10}
```
### Set Lowest Latency Buffer
Set a buffer within which deployments are candidates for making calls to.
E.g.
if you have 5 deployments
```
https://litellm-prod-1.openai.azure.com/: 0.07s
https://litellm-prod-2.openai.azure.com/: 0.1s
https://litellm-prod-3.openai.azure.com/: 0.1s
https://litellm-prod-4.openai.azure.com/: 0.1s
https://litellm-prod-5.openai.azure.com/: 4.66s
```
To prevent initially overloading `prod-1` with all requests, we can set a buffer of 50% to consider deployments `prod-2, prod-3, prod-4`.
**In Router**
```python
router = Router(..., routing_strategy_args={"lowest_latency_buffer": 0.5})
```
**In Proxy**
```yaml
router_settings:
routing_strategy_args: {"lowest_latency_buffer": 0.5}
```
</TabItem>
<TabItem value="simple-shuffle" label="(Default) Weighted Pick (Async)">
@ -443,6 +473,35 @@ asyncio.run(router_acompletion())
## Basic Reliability
### Max Parallel Requests (ASYNC)
Used in semaphore for async requests on router. Limit the max concurrent calls made to a deployment. Useful in high-traffic scenarios.
If tpm/rpm is set, and no max parallel request limit given, we use the RPM or calculated RPM (tpm/1000/6) as the max parallel request limit.
```python
from litellm import Router
model_list = [{
"model_name": "gpt-4",
"litellm_params": {
"model": "azure/gpt-4",
...
"max_parallel_requests": 10 # 👈 SET PER DEPLOYMENT
}
}]
### OR ###
router = Router(model_list=model_list, default_max_parallel_requests=20) # 👈 SET DEFAULT MAX PARALLEL REQUESTS
# deployment max parallel requests > default max parallel requests
```
[**See Code**](https://github.com/BerriAI/litellm/blob/a978f2d8813c04dad34802cb95e0a0e35a3324bc/litellm/utils.py#L5605)
### Timeouts
The timeout set in router is for the entire length of the call, and is passed down to the completion() call level as well.


@ -5,6 +5,9 @@ LiteLLM allows you to specify the following:
* API Base
* API Version
* API Type
* Project
* Location
* Token
Useful Helper functions:
* [`check_valid_key()`](#check_valid_key)
@ -43,6 +46,24 @@ os.environ['AZURE_API_TYPE'] = "azure" # [OPTIONAL]
os.environ['OPENAI_API_BASE'] = "https://openai-gpt-4-test2-v-12.openai.azure.com/"
```
### Setting Project, Location, Token
For cloud providers:
- Azure
- Bedrock
- GCP
- Watson AI
you might need to set additional parameters. LiteLLM provides a common set of params that we map across all providers.
| | LiteLLM param | Watson | Vertex AI | Azure | Bedrock |
|------|--------------|--------------|--------------|--------------|--------------|
| Project | project | watsonx_project | vertex_project | n/a | n/a |
| Region | region_name | watsonx_region_name | vertex_location | n/a | aws_region_name |
| Token | token | watsonx_token or token | n/a | azure_ad_token | n/a |
If you want, you can call them by their provider-specific params as well.
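As a rough illustration, assuming the common params are accepted directly as `completion()` kwargs and mapped per the table above (the project, region, and token values are placeholders):

```python
from litellm import completion

# project -> watsonx_project, region_name -> watsonx_region_name, token -> watsonx_token (per the mapping table above)
response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{"role": "user", "content": "Hello!"}],
    project="my-project-id",   # placeholder
    region_name="us-south",    # placeholder
    token="my-iam-token",      # placeholder
)
```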
## litellm variables
### litellm.api_key

Two binary image files added (not shown): 1.5 MiB and 533 KiB.


@ -43,6 +43,12 @@ const sidebars = {
"proxy/user_keys", "proxy/user_keys",
"proxy/enterprise", "proxy/enterprise",
"proxy/virtual_keys", "proxy/virtual_keys",
"proxy/alerting",
{
type: "category",
label: "Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/team_based_routing", "proxy/team_based_routing",
"proxy/ui", "proxy/ui",
"proxy/cost_tracking", "proxy/cost_tracking",
@ -58,11 +64,6 @@ const sidebars = {
"proxy/pii_masking", "proxy/pii_masking",
"proxy/prompt_injection", "proxy/prompt_injection",
"proxy/caching", "proxy/caching",
{
type: "category",
label: "Logging, Alerting",
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
},
"proxy/prometheus", "proxy/prometheus",
"proxy/call_hooks", "proxy/call_hooks",
"proxy/rules", "proxy/rules",
@ -148,6 +149,7 @@ const sidebars = {
"providers/openrouter", "providers/openrouter",
"providers/custom_openai_proxy", "providers/custom_openai_proxy",
"providers/petals", "providers/petals",
"providers/watsonx",
], ],
}, },
"proxy/custom_pricing", "proxy/custom_pricing",
@ -168,6 +170,7 @@ const sidebars = {
"observability/custom_callback", "observability/custom_callback",
"observability/langfuse_integration", "observability/langfuse_integration",
"observability/sentry", "observability/sentry",
"observability/openmeter",
"observability/promptlayer_integration", "observability/promptlayer_integration",
"observability/wandb_integration", "observability/wandb_integration",
"observability/langsmith_integration", "observability/langsmith_integration",
@ -175,7 +178,6 @@ const sidebars = {
"observability/traceloop_integration", "observability/traceloop_integration",
"observability/athina_integration", "observability/athina_integration",
"observability/lunary_integration", "observability/lunary_integration",
"observability/athina_integration",
"observability/helicone_integration", "observability/helicone_integration",
"observability/supabase_integration", "observability/supabase_integration",
`observability/telemetry`, `observability/telemetry`,


@ -6,7 +6,7 @@
"": { "": {
"dependencies": { "dependencies": {
"@hono/node-server": "^1.9.0", "@hono/node-server": "^1.9.0",
"hono": "^4.1.5" "hono": "^4.2.7"
}, },
"devDependencies": { "devDependencies": {
"@types/node": "^20.11.17", "@types/node": "^20.11.17",
@ -463,9 +463,9 @@
} }
}, },
"node_modules/hono": { "node_modules/hono": {
"version": "4.1.5", "version": "4.2.7",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz", "resolved": "https://registry.npmjs.org/hono/-/hono-4.2.7.tgz",
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==", "integrity": "sha512-k1xHi86tJnRIVvqhFMBDGFKJ8r5O+bEsT4P59ZK59r0F300Xd910/r237inVfuT/VmE86RQQffX4OYNda6dLXw==",
"engines": { "engines": {
"node": ">=16.0.0" "node": ">=16.0.0"
} }


@ -4,7 +4,7 @@
},
"dependencies": {
  "@hono/node-server": "^1.9.0",
-  "hono": "^4.1.5"
+  "hono": "^4.2.7"
},
"devDependencies": {
  "@types/node": "^20.11.17",


@ -2,7 +2,7 @@
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from litellm.caching import Cache
-from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
+from litellm._logging import set_verbose, _turn_on_debug, verbose_logger, json_logs
from litellm.proxy._types import (
    KeyManagementSystem,
    KeyManagementSettings,
@ -22,6 +22,7 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
callbacks: List[Callable] = []
+_custom_logger_compatible_callbacks: list = ["openmeter"]
_langfuse_default_tags: Optional[
    List[
        Literal[
@ -45,6 +46,7 @@ _async_failure_callback: List[Callable] = (
) # internal variable - async custom callbacks are routed here.
pre_call_rules: List[Callable] = []
post_call_rules: List[Callable] = []
+turn_off_message_logging: Optional[bool] = False
## end of callbacks #############
email: Optional[str] = (
@ -58,6 +60,7 @@ max_tokens = 256 # OpenAI Defaults
drop_params = False
modify_params = False
retry = True
+### AUTH ###
api_key: Optional[str] = None
openai_key: Optional[str] = None
azure_key: Optional[str] = None
@ -76,7 +79,12 @@ cloudflare_api_key: Optional[str] = None
baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None
+common_cloud_provider_auth_params: dict = {
+    "params": ["project", "region_name", "token"],
+    "providers": ["vertex_ai", "bedrock", "watsonx", "azure"],
+}
use_client: bool = False
+ssl_verify: bool = True
disable_streaming_logging: bool = False
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
@ -298,6 +306,7 @@ aleph_alpha_models: List = []
bedrock_models: List = []
deepinfra_models: List = []
perplexity_models: List = []
+watsonx_models: List = []
for key, value in model_cost.items():
    if value.get("litellm_provider") == "openai":
        open_ai_chat_completion_models.append(key)
@ -342,6 +351,8 @@ for key, value in model_cost.items():
        deepinfra_models.append(key)
    elif value.get("litellm_provider") == "perplexity":
        perplexity_models.append(key)
+    elif value.get("litellm_provider") == "watsonx":
+        watsonx_models.append(key)
# known openai compatible endpoints - we'll eventually move this list to the model_prices_and_context_window.json dictionary
openai_compatible_endpoints: List = [
@ -478,6 +489,7 @@ model_list = (
    + perplexity_models
    + maritalk_models
    + vertex_language_models
+    + watsonx_models
)
provider_list: List = [
@ -516,6 +528,7 @@ provider_list: List = [
    "cloudflare",
    "xinference",
    "fireworks_ai",
+    "watsonx",
    "custom", # custom apis
]
@ -537,6 +550,7 @@ models_by_provider: dict = {
    "deepinfra": deepinfra_models,
    "perplexity": perplexity_models,
    "maritalk": maritalk_models,
+    "watsonx": watsonx_models,
}
# mapping for those models which have larger equivalents
@ -647,9 +661,11 @@ from .llms.bedrock import (
    AmazonLlamaConfig,
    AmazonStabilityConfig,
    AmazonMistralConfig,
+    AmazonBedrockGlobalConfig,
)
from .llms.openai import OpenAIConfig, OpenAITextCompletionConfig
from .llms.azure import AzureOpenAIConfig, AzureOpenAIError
+from .llms.watsonx import IBMWatsonXAIConfig
from .main import * # type: ignore
from .integrations import *
from .exceptions import (


@ -1,7 +1,7 @@
import logging
set_verbose = False
+json_logs = False
# Create a handler for the logger (you may need to adapt this based on your needs)
handler = logging.StreamHandler()
handler.setLevel(logging.DEBUG)


@ -12,9 +12,12 @@ import litellm
class LangFuseLogger:
    # Class variables or attributes
-    def __init__(self, langfuse_public_key=None, langfuse_secret=None):
+    def __init__(
+        self, langfuse_public_key=None, langfuse_secret=None, flush_interval=1
+    ):
        try:
            from langfuse import Langfuse
+            import langfuse
        except Exception as e:
            raise Exception(
                f"\033[91mLangfuse not installed, try running 'pip install langfuse' to fix this error: {e}\n{traceback.format_exc()}\033[0m"
@ -25,14 +28,20 @@ class LangFuseLogger:
        self.langfuse_host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
        self.langfuse_release = os.getenv("LANGFUSE_RELEASE")
        self.langfuse_debug = os.getenv("LANGFUSE_DEBUG")
-        self.Langfuse = Langfuse(
-            public_key=self.public_key,
-            secret_key=self.secret_key,
-            host=self.langfuse_host,
-            release=self.langfuse_release,
-            debug=self.langfuse_debug,
-            flush_interval=1,  # flush interval in seconds
-        )
+        parameters = {
+            "public_key": self.public_key,
+            "secret_key": self.secret_key,
+            "host": self.langfuse_host,
+            "release": self.langfuse_release,
+            "debug": self.langfuse_debug,
+            "flush_interval": flush_interval,  # flush interval in seconds
+        }
+        if Version(langfuse.version.__version__) >= Version("2.6.0"):
+            parameters["sdk_integration"] = "litellm"
+        self.Langfuse = Langfuse(**parameters)
        # set the current langfuse project id in the environ
        # this is used by Alerting to link to the correct project
@ -77,13 +86,14 @@ class LangFuseLogger:
        print_verbose,
        level="DEFAULT",
        status_message=None,
-    ):
+    ) -> dict:
        # Method definition
        try:
            print_verbose(
                f"Langfuse Logging - Enters logging function for model {kwargs}"
            )
            litellm_params = kwargs.get("litellm_params", {})
            metadata = (
                litellm_params.get("metadata", {}) or {}
@ -137,8 +147,10 @@ class LangFuseLogger:
                input = prompt
                output = response_obj["data"]
            print_verbose(f"OUTPUT IN LANGFUSE: {output}; original: {response_obj}")
+            trace_id = None
+            generation_id = None
            if self._is_langfuse_v2():
-                self._log_langfuse_v2(
+                trace_id, generation_id = self._log_langfuse_v2(
                    user_id,
                    metadata,
                    litellm_params,
@ -168,10 +180,12 @@ class LangFuseLogger:
                f"Langfuse Layer Logging - final response object: {response_obj}"
            )
            verbose_logger.info(f"Langfuse Layer Logging - logging success")
+            return {"trace_id": trace_id, "generation_id": generation_id}
        except:
            traceback.print_exc()
            verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
-            pass
+            return {"trace_id": None, "generation_id": None}
    async def _async_log_event(
        self, kwargs, response_obj, start_time, end_time, user_id, print_verbose
@ -243,7 +257,7 @@ class LangFuseLogger:
        response_obj,
        level,
        print_verbose,
-    ):
+    ) -> tuple:
        import langfuse
        try:
@ -262,22 +276,28 @@ class LangFuseLogger:
                tags = metadata_tags
            trace_name = metadata.get("trace_name", None)
-            if trace_name is None:
+            trace_id = metadata.get("trace_id", None)
+            existing_trace_id = metadata.get("existing_trace_id", None)
+            if trace_name is None and existing_trace_id is None:
                # just log `litellm-{call_type}` as the trace name
+                ## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
                trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
-            trace_params = {
-                "name": trace_name,
-                "input": input,
-                "user_id": metadata.get("trace_user_id", user_id),
-                "id": metadata.get("trace_id", None),
-                "session_id": metadata.get("session_id", None),
-            }
+            if existing_trace_id is not None:
+                trace_params = {"id": existing_trace_id}
+            else:  # don't overwrite an existing trace
+                trace_params = {
+                    "name": trace_name,
+                    "input": input,
+                    "user_id": metadata.get("trace_user_id", user_id),
+                    "id": trace_id,
+                    "session_id": metadata.get("session_id", None),
+                }
            if level == "ERROR":
                trace_params["status_message"] = output
            else:
                trace_params["output"] = output
            cost = kwargs.get("response_cost", None)
            print_verbose(f"trace: {cost}")
@ -335,7 +355,8 @@ class LangFuseLogger:
                kwargs["cache_hit"] = False
            tags.append(f"cache_hit:{kwargs['cache_hit']}")
            clean_metadata["cache_hit"] = kwargs["cache_hit"]
-            trace_params.update({"tags": tags})
+            if existing_trace_id is None:
+                trace_params.update({"tags": tags})
            proxy_server_request = litellm_params.get("proxy_server_request", None)
            if proxy_server_request:
@ -355,8 +376,6 @@ class LangFuseLogger:
                "headers": clean_headers,
            }
-            print_verbose(f"trace_params: {trace_params}")
            trace = self.Langfuse.trace(**trace_params)
            generation_id = None
@ -373,7 +392,11 @@ class LangFuseLogger:
            # just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}" generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
system_fingerprint = response_obj.get("system_fingerprint", None) if response_obj is not None and "system_fingerprint" in response_obj:
system_fingerprint = response_obj.get("system_fingerprint", None)
else:
system_fingerprint = None
if system_fingerprint is not None: if system_fingerprint is not None:
optional_params["system_fingerprint"] = system_fingerprint optional_params["system_fingerprint"] = system_fingerprint
@ -402,8 +425,9 @@ class LangFuseLogger:
"completion_start_time", None "completion_start_time", None
) )
print_verbose(f"generation_params: {generation_params}") generation_client = trace.generation(**generation_params)
trace.generation(**generation_params) return generation_client.trace_id, generation_id
except Exception as e: except Exception as e:
verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}") verbose_logger.debug(f"Langfuse Layer Error - {traceback.format_exc()}")
return None, None
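
Note on the `existing_trace_id` handling added above: callers can attach a generation to a Langfuse trace they created themselves, without that trace's name or tags being overwritten. A minimal sketch of how this is passed in from the client side (the completion call and ids are illustrative; the metadata keys are the ones read by `_log_langfuse_v2`):

import litellm

litellm.success_callback = ["langfuse"]  # assumes LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are set

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
    metadata={
        # attach this generation to a trace created elsewhere;
        # the existing trace's name and tags are left untouched
        "existing_trace_id": "my-precreated-trace-id",
    },
)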

View file

@ -73,10 +73,6 @@ class LangsmithLogger:
elif type(value) != dict and is_serializable(value=value): elif type(value) != dict and is_serializable(value=value):
new_kwargs[key] = value new_kwargs[key] = value
print(f"type of response: {type(response_obj)}")
for k, v in new_kwargs.items():
print(f"key={k}, type of arg: {type(v)}, value={v}")
if isinstance(response_obj, BaseModel): if isinstance(response_obj, BaseModel):
try: try:
response_obj = response_obj.model_dump() response_obj = response_obj.model_dump()

View file

@ -0,0 +1,123 @@
# What is this?
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
import dotenv, os, json
import requests
import litellm
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
import uuid
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
class OpenMeterLogger(CustomLogger):
def __init__(self) -> None:
super().__init__()
self.validate_environment()
self.async_http_handler = AsyncHTTPHandler()
self.sync_http_handler = HTTPHandler()
def validate_environment(self):
"""
Expects
OPENMETER_API_KEY
in the environment (OPENMETER_API_ENDPOINT is optional; it defaults to https://openmeter.cloud)
"""
missing_keys = []
if litellm.get_secret("OPENMETER_API_KEY", None) is None:
missing_keys.append("OPENMETER_API_KEY")
if len(missing_keys) > 0:
raise Exception("Missing keys={} in environment.".format(missing_keys))
def _common_logic(self, kwargs: dict, response_obj):
call_id = response_obj.get("id", kwargs.get("litellm_call_id"))
dt = get_utc_datetime().isoformat()
cost = kwargs.get("response_cost", None)
model = kwargs.get("model")
usage = {}
if (
isinstance(response_obj, litellm.ModelResponse)
or isinstance(response_obj, litellm.EmbeddingResponse)
) and hasattr(response_obj, "usage"):
usage = {
"prompt_tokens": response_obj["usage"].get("prompt_tokens", 0),
"completion_tokens": response_obj["usage"].get("completion_tokens", 0),
"total_tokens": response_obj["usage"].get("total_tokens"),
}
return {
"specversion": "1.0",
"type": os.getenv("OPENMETER_EVENT_TYPE", "litellm_tokens"),
"id": call_id,
"time": dt,
"subject": kwargs.get("user", ""), # end-user passed in via 'user' param
"source": "litellm-proxy",
"data": {"model": model, "cost": cost, **usage},
}
def log_success_event(self, kwargs, response_obj, start_time, end_time):
_url = litellm.get_secret(
"OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
)
if _url.endswith("/"):
_url += "api/v1/events"
else:
_url += "/api/v1/events"
api_key = litellm.get_secret("OPENMETER_API_KEY")
_data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
self.sync_http_handler.post(
url=_url,
data=json.dumps(_data),  # serialize the payload to match the cloudevents+json content type
headers={
"Content-Type": "application/cloudevents+json",
"Authorization": "Bearer {}".format(api_key),
},
)
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
_url = litellm.get_secret(
"OPENMETER_API_ENDPOINT", default_value="https://openmeter.cloud"
)
if _url.endswith("/"):
_url += "api/v1/events"
else:
_url += "/api/v1/events"
api_key = litellm.get_secret("OPENMETER_API_KEY")
_data = self._common_logic(kwargs=kwargs, response_obj=response_obj)
_headers = {
"Content-Type": "application/cloudevents+json",
"Authorization": "Bearer {}".format(api_key),
}
try:
response = await self.async_http_handler.post(
url=_url,
data=json.dumps(_data),
headers=_headers,
)
response.raise_for_status()
except Exception as e:
print(f"\nAn Exception Occurred - {str(e)}")
if hasattr(response, "text"):
print(f"\nError Message: {response.text}")
raise e
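
For reference, the CloudEvents body that `_common_logic` assembles and posts to `/api/v1/events` looks roughly like this (shown as a Python dict; all values are illustrative):

event = {
    "specversion": "1.0",
    "type": "litellm_tokens",          # overridable via OPENMETER_EVENT_TYPE
    "id": "chatcmpl-123",              # response id, falling back to litellm_call_id
    "time": "2024-05-02T12:00:00+00:00",
    "subject": "end-user-id",          # the 'user' param passed on the request
    "source": "litellm-proxy",
    "data": {
        "model": "gpt-3.5-turbo",
        "cost": 0.000215,
        "prompt_tokens": 11,
        "completion_tokens": 9,
        "total_tokens": 20,
    },
}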

View file

@ -7,11 +7,12 @@ import copy
import traceback import traceback
from litellm._logging import verbose_logger, verbose_proxy_logger from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm import litellm
from typing import List, Literal, Any, Union, Optional from typing import List, Literal, Any, Union, Optional, Dict
from litellm.caching import DualCache from litellm.caching import DualCache
import asyncio import asyncio
import aiohttp import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime
class SlackAlerting: class SlackAlerting:
@ -37,12 +38,28 @@ class SlackAlerting:
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
], ],
alert_to_webhook_url: Optional[
Dict
] = None, # if user wants to separate alerts to diff channels
): ):
self.alerting_threshold = alerting_threshold self.alerting_threshold = alerting_threshold
self.alerting = alerting self.alerting = alerting
self.alert_types = alert_types self.alert_types = alert_types
self.internal_usage_cache = DualCache() self.internal_usage_cache = DualCache()
self.async_http_handler = AsyncHTTPHandler() self.async_http_handler = AsyncHTTPHandler()
self.alert_to_webhook_url = alert_to_webhook_url
self.langfuse_logger = None
try:
from litellm.integrations.langfuse import LangFuseLogger
self.langfuse_logger = LangFuseLogger(
os.getenv("LANGFUSE_PUBLIC_KEY"),
os.getenv("LANGFUSE_SECRET_KEY"),
flush_interval=1,
)
except:
pass
pass pass
@ -51,6 +68,7 @@ class SlackAlerting:
alerting: Optional[List] = None, alerting: Optional[List] = None,
alerting_threshold: Optional[float] = None, alerting_threshold: Optional[float] = None,
alert_types: Optional[List] = None, alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None,
): ):
if alerting is not None: if alerting is not None:
self.alerting = alerting self.alerting = alerting
@ -59,6 +77,13 @@ class SlackAlerting:
if alert_types is not None: if alert_types is not None:
self.alert_types = alert_types self.alert_types = alert_types
if alert_to_webhook_url is not None:
# update the dict
if self.alert_to_webhook_url is None:
self.alert_to_webhook_url = alert_to_webhook_url
else:
self.alert_to_webhook_url.update(alert_to_webhook_url)
async def deployment_in_cooldown(self): async def deployment_in_cooldown(self):
pass pass
@ -81,39 +106,68 @@ class SlackAlerting:
request_info: str, request_info: str,
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
kwargs: Optional[dict] = None, kwargs: Optional[dict] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request",
start_time: Optional[datetime.datetime] = None,
end_time: Optional[datetime.datetime] = None,
): ):
import uuid import uuid
# For now: do nothing as we're debugging why this is not working as expected # For now: do nothing as we're debugging why this is not working as expected
if request_data is not None:
trace_id = request_data.get("metadata", {}).get(
"trace_id", None
) # get langfuse trace id
if trace_id is None:
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
request_data["metadata"]["trace_id"] = trace_id
elif kwargs is not None:
_litellm_params = kwargs.get("litellm_params", {})
trace_id = _litellm_params.get("metadata", {}).get(
"trace_id", None
) # get langfuse trace id
if trace_id is None:
trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
_litellm_params["metadata"]["trace_id"] = trace_id
# Log hanging request as an error on langfuse
if type == "hanging_request":
if self.langfuse_logger is not None:
_logging_kwargs = copy.deepcopy(request_data)
if _logging_kwargs is None:
_logging_kwargs = {}
_logging_kwargs["litellm_params"] = {}
request_data = request_data or {}
_logging_kwargs["litellm_params"]["metadata"] = request_data.get(
"metadata", {}
)
# log to langfuse in a separate thread
import threading
threading.Thread(
target=self.langfuse_logger.log_event,
args=(
_logging_kwargs,
None,
start_time,
end_time,
None,
print,
"ERROR",
"Requests is hanging",
),
).start()
_langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
_langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
# langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
_langfuse_url = (
f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
)
request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
return request_info return request_info
# if request_data is not None:
# trace_id = request_data.get("metadata", {}).get(
# "trace_id", None
# ) # get langfuse trace id
# if trace_id is None:
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
# request_data["metadata"]["trace_id"] = trace_id
# elif kwargs is not None:
# _litellm_params = kwargs.get("litellm_params", {})
# trace_id = _litellm_params.get("metadata", {}).get(
# "trace_id", None
# ) # get langfuse trace id
# if trace_id is None:
# trace_id = "litellm-alert-trace-" + str(uuid.uuid4())
# _litellm_params["metadata"]["trace_id"] = trace_id
# _langfuse_host = os.environ.get("LANGFUSE_HOST", "https://cloud.langfuse.com")
# _langfuse_project_id = os.environ.get("LANGFUSE_PROJECT_ID")
# # langfuse urls look like: https://us.cloud.langfuse.com/project/************/traces/litellm-alert-trace-ididi9dk-09292-************
# _langfuse_url = (
# f"{_langfuse_host}/project/{_langfuse_project_id}/traces/{trace_id}"
# )
# request_info += f"\n🪢 Langfuse Trace: {_langfuse_url}"
# return request_info
def _response_taking_too_long_callback( def _response_taking_too_long_callback(
self, self,
kwargs, # kwargs to completion kwargs, # kwargs to completion
@ -140,7 +194,6 @@ class SlackAlerting:
raise e raise e
def _get_deployment_latencies_to_alert(self, metadata=None): def _get_deployment_latencies_to_alert(self, metadata=None):
if metadata is None: if metadata is None:
return None return None
@ -156,6 +209,14 @@ class SlackAlerting:
_deployment_latencies = metadata["_latency_per_deployment"] _deployment_latencies = metadata["_latency_per_deployment"]
if len(_deployment_latencies) == 0: if len(_deployment_latencies) == 0:
return None return None
try:
# try sorting deployments by latency
_deployment_latencies = sorted(
_deployment_latencies.items(), key=lambda x: x[1]
)
_deployment_latencies = dict(_deployment_latencies)
except:
pass
for api_base, latency in _deployment_latencies.items(): for api_base, latency in _deployment_latencies.items():
_message_to_send += f"\n{api_base}: {round(latency,2)}s" _message_to_send += f"\n{api_base}: {round(latency,2)}s"
_message_to_send = "```" + _message_to_send + "```" _message_to_send = "```" + _message_to_send + "```"
@ -171,8 +232,6 @@ class SlackAlerting:
if self.alerting is None or self.alert_types is None: if self.alerting is None or self.alert_types is None:
return return
if "llm_too_slow" not in self.alert_types:
return
time_difference_float, model, api_base, messages = ( time_difference_float, model, api_base, messages = (
self._response_taking_too_long_callback( self._response_taking_too_long_callback(
kwargs=kwargs, kwargs=kwargs,
@ -185,7 +244,7 @@ class SlackAlerting:
if time_difference_float > self.alerting_threshold: if time_difference_float > self.alerting_threshold:
if "langfuse" in litellm.success_callback: if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert( request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info, kwargs=kwargs request_info=request_info, kwargs=kwargs, type="slow_response"
) )
# add deployment latencies to alert # add deployment latencies to alert
if ( if (
@ -205,6 +264,7 @@ class SlackAlerting:
await self.send_alert( await self.send_alert(
message=slow_message + request_info, message=slow_message + request_info,
level="Low", level="Low",
alert_type="llm_too_slow",
) )
async def log_failure_event(self, original_exception: Exception): async def log_failure_event(self, original_exception: Exception):
@ -212,8 +272,8 @@ class SlackAlerting:
async def response_taking_too_long( async def response_taking_too_long(
self, self,
start_time: Optional[float] = None, start_time: Optional[datetime.datetime] = None,
end_time: Optional[float] = None, end_time: Optional[datetime.datetime] = None,
type: Literal["hanging_request", "slow_response"] = "hanging_request", type: Literal["hanging_request", "slow_response"] = "hanging_request",
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
): ):
@ -233,17 +293,10 @@ class SlackAlerting:
except: except:
messages = "" messages = ""
request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`" request_info = f"\nRequest Model: `{model}`\nMessages: `{messages}`"
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info, request_data=request_data
)
else: else:
request_info = "" request_info = ""
if type == "hanging_request": if type == "hanging_request":
# Simulate a long-running operation that could take more than 5 minutes
if "llm_requests_hanging" not in self.alert_types:
return
await asyncio.sleep( await asyncio.sleep(
self.alerting_threshold self.alerting_threshold
) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests ) # Set it to 5 minutes - i'd imagine this might be different for streaming, non-streaming, non-completion (embedding + img) requests
@ -281,6 +334,15 @@ class SlackAlerting:
f"`Requests are hanging - {self.alerting_threshold}s+ request time`" f"`Requests are hanging - {self.alerting_threshold}s+ request time`"
) )
if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert(
request_info=request_info,
request_data=request_data,
type="hanging_request",
start_time=start_time,
end_time=end_time,
)
# add deployment latencies to alert # add deployment latencies to alert
_deployment_latency_map = self._get_deployment_latencies_to_alert( _deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=request_data.get("metadata", {}) metadata=request_data.get("metadata", {})
@ -291,6 +353,7 @@ class SlackAlerting:
await self.send_alert( await self.send_alert(
message=alerting_message + request_info, message=alerting_message + request_info,
level="Medium", level="Medium",
alert_type="llm_requests_hanging",
) )
async def budget_alerts( async def budget_alerts(
@ -336,8 +399,7 @@ class SlackAlerting:
user_info = f"\nUser ID: {user_id}\n Error {error_message}" user_info = f"\nUser ID: {user_id}\n Error {error_message}"
message = "Failed Tracking Cost for" + user_info message = "Failed Tracking Cost for" + user_info
await self.send_alert( await self.send_alert(
message=message, message=message, level="High", alert_type="budget_alerts"
level="High",
) )
return return
elif type == "projected_limit_exceeded" and user_info is not None: elif type == "projected_limit_exceeded" and user_info is not None:
@ -353,8 +415,7 @@ class SlackAlerting:
""" """
message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}""" message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` {user_info["key_alias"]} \n`Expected Day of Error`: {user_info["projected_exceeded_date"]} \n`Current Spend`: {user_current_spend} \n`Projected Spend at end of month`: {user_info["projected_spend"]} \n`Soft Limit`: {user_max_budget}"""
await self.send_alert( await self.send_alert(
message=message, message=message, level="High", alert_type="budget_alerts"
level="High",
) )
return return
else: else:
@ -382,8 +443,7 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=message) result = await _cache.async_get_cache(key=message)
if result is None: if result is None:
await self.send_alert( await self.send_alert(
message=message, message=message, level="High", alert_type="budget_alerts"
level="High",
) )
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200) await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return return
@ -395,8 +455,7 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=cache_key) result = await _cache.async_get_cache(key=cache_key)
if result is None: if result is None:
await self.send_alert( await self.send_alert(
message=message, message=message, level="Medium", alert_type="budget_alerts"
level="Medium",
) )
await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200) await _cache.async_set_cache(key=cache_key, value="SENT", ttl=2419200)
@ -409,15 +468,25 @@ class SlackAlerting:
result = await _cache.async_get_cache(key=message) result = await _cache.async_get_cache(key=message)
if result is None: if result is None:
await self.send_alert( await self.send_alert(
message=message, message=message, level="Low", alert_type="budget_alerts"
level="Low",
) )
await _cache.async_set_cache(key=message, value="SENT", ttl=2419200) await _cache.async_set_cache(key=message, value="SENT", ttl=2419200)
return return
return return
async def send_alert(self, message: str, level: Literal["Low", "Medium", "High"]): async def send_alert(
self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
],
):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -432,12 +501,6 @@ class SlackAlerting:
level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'. level: str - Low|Medium|High - if calls might fail (Medium) or are failing (High); Currently, no alerts would be 'Low'.
message: str - what is the alert about message: str - what is the alert about
""" """
print(
"inside send alert for slack, message: ",
message,
"self.alerting: ",
self.alerting,
)
if self.alerting is None: if self.alerting is None:
return return
@ -453,7 +516,15 @@ class SlackAlerting:
if _proxy_base_url is not None: if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`" formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None) # check if we find the slack webhook url in self.alert_to_webhook_url
if (
self.alert_to_webhook_url is not None
and alert_type in self.alert_to_webhook_url
):
slack_webhook_url = self.alert_to_webhook_url[alert_type]
else:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
if slack_webhook_url is None: if slack_webhook_url is None:
raise Exception("Missing SLACK_WEBHOOK_URL from environment") raise Exception("Missing SLACK_WEBHOOK_URL from environment")
payload = {"text": formatted_message} payload = {"text": formatted_message}
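
With the `alert_to_webhook_url` parameter introduced above, each alert type can be routed to its own Slack channel, falling back to `SLACK_WEBHOOK_URL` when no mapping exists. A minimal sketch using only the constructor arguments shown in this diff (webhook URLs are placeholders; the import path is assumed, adjust it to wherever `SlackAlerting` lives in your version):

from litellm.integrations.slack_alerting import SlackAlerting  # assumed module path

slack_alerting = SlackAlerting(
    alerting=["slack"],
    alert_types=["llm_too_slow", "llm_requests_hanging", "budget_alerts"],
    alert_to_webhook_url={
        # alert types without an entry fall back to the SLACK_WEBHOOK_URL env var
        "llm_too_slow": "https://hooks.slack.com/services/T000/B000/slow-channel",
        "budget_alerts": "https://hooks.slack.com/services/T000/B000/budget-channel",
    },
)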

View file

@ -96,6 +96,15 @@ class AzureOpenAIConfig(OpenAIConfig):
top_p, top_p,
) )
def get_mapped_special_auth_params(self) -> dict:
return {"token": "azure_ad_token"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "token":
optional_params["azure_ad_token"] = value
return optional_params
def select_azure_base_url_or_endpoint(azure_client_params: dict): def select_azure_base_url_or_endpoint(azure_client_params: dict):
# azure_client_params = { # azure_client_params = {

View file

@ -29,6 +29,24 @@ class BedrockError(Exception):
) # Call the base class constructor with the parameters it needs ) # Call the base class constructor with the parameters it needs
class AmazonBedrockGlobalConfig:
def __init__(self):
pass
def get_mapped_special_auth_params(self) -> dict:
"""
Mapping of common auth params across bedrock/vertex/azure/watsonx
"""
return {"region_name": "aws_region_name"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
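
The `get_mapped_special_auth_params` / `map_special_auth_params` pair added here (and mirrored for Azure, Vertex AI and watsonx elsewhere in this diff) translates shared auth param names into provider-specific ones. A small sketch of what the Bedrock mapping does:

from litellm.llms.bedrock import AmazonBedrockGlobalConfig

config = AmazonBedrockGlobalConfig()
optional_params = config.map_special_auth_params(
    non_default_params={"region_name": "us-west-2"},  # common name shared across providers
    optional_params={},
)
# optional_params == {"aws_region_name": "us-west-2"}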
class AmazonTitanConfig: class AmazonTitanConfig:
""" """
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1 Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-text-express-v1
@ -666,6 +684,10 @@ def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
prompt = prompt_factory( prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock" model=model, messages=messages, custom_llm_provider="bedrock"
) )
elif provider == "meta":
prompt = prompt_factory(
model=model, messages=messages, custom_llm_provider="bedrock"
)
else: else:
prompt = "" prompt = ""
for message in messages: for message in messages:
@ -945,7 +967,7 @@ def completion(
original_response=json.dumps(response_body), original_response=json.dumps(response_body),
additional_args={"complete_input_dict": data}, additional_args={"complete_input_dict": data},
) )
print_verbose(f"raw model_response: {response}") print_verbose(f"raw model_response: {response_body}")
## RESPONSE OBJECT ## RESPONSE OBJECT
outputText = "default" outputText = "default"
if provider == "ai21": if provider == "ai21":
@ -1058,6 +1080,7 @@ def completion(
outputText = response_body.get("results")[0].get("outputText") outputText = response_body.get("results")[0].get("outputText")
response_metadata = response.get("ResponseMetadata", {}) response_metadata = response.get("ResponseMetadata", {})
if response_metadata.get("HTTPStatusCode", 500) >= 400: if response_metadata.get("HTTPStatusCode", 500) >= 400:
raise BedrockError( raise BedrockError(
message=outputText, message=outputText,
@ -1093,11 +1116,13 @@ def completion(
prompt_tokens = response_metadata.get( prompt_tokens = response_metadata.get(
"x-amzn-bedrock-input-token-count", len(encoding.encode(prompt)) "x-amzn-bedrock-input-token-count", len(encoding.encode(prompt))
) )
_text_response = model_response["choices"][0]["message"].get("content", "")
completion_tokens = response_metadata.get( completion_tokens = response_metadata.get(
"x-amzn-bedrock-output-token-count", "x-amzn-bedrock-output-token-count",
len( len(
encoding.encode( encoding.encode(
model_response["choices"][0]["message"].get("content", "") _text_response,
disallowed_special=(),
) )
), ),
) )

View file

@ -213,12 +213,13 @@ def get_ollama_response(
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if optional_params.get("format", "") == "json": if optional_params.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"arguments": response_json["response"], "name": ""}, "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"type": "function", "type": "function",
} }
], ],
@ -310,15 +311,13 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json": if data.get("format", "") == "json":
function_call = json.loads(response_json["response"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": { "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"arguments": response_json["response"],
"name": "",
},
"type": "function", "type": "function",
} }
], ],
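
The change above (repeated for the chat endpoint in the next file) assumes the model's JSON-mode output is itself a function call of the form {"name": ..., "arguments": {...}} and unpacks it into an OpenAI-style tool call instead of stuffing the raw string into `arguments`. A sketch of the transformation, with an illustrative Ollama response:

import json
import uuid

response_json = {"response": '{"name": "get_weather", "arguments": {"city": "Paris"}}'}

function_call = json.loads(response_json["response"])
tool_call = {
    "id": f"call_{uuid.uuid4()}",
    "function": {
        "name": function_call["name"],
        "arguments": json.dumps(function_call["arguments"]),
    },
    "type": "function",
}
# tool_call["function"]["arguments"] == '{"city": "Paris"}'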

View file

@ -285,15 +285,13 @@ def get_ollama_response(
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json": if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": { "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"arguments": response_json["message"]["content"],
"name": "",
},
"type": "function", "type": "function",
} }
], ],
@ -415,15 +413,13 @@ async def ollama_acompletion(
## RESPONSE OBJECT ## RESPONSE OBJECT
model_response["choices"][0]["finish_reason"] = "stop" model_response["choices"][0]["finish_reason"] = "stop"
if data.get("format", "") == "json": if data.get("format", "") == "json":
function_call = json.loads(response_json["message"]["content"])
message = litellm.Message( message = litellm.Message(
content=None, content=None,
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": { "function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
"arguments": response_json["message"]["content"],
"name": function_name or "",
},
"type": "function", "type": "function",
} }
], ],

View file

@ -447,6 +447,7 @@ class OpenAIChatCompletion(BaseLLM):
) )
else: else:
openai_aclient = client openai_aclient = client
## LOGGING ## LOGGING
logging_obj.pre_call( logging_obj.pre_call(
input=data["messages"], input=data["messages"],

View file

@ -3,8 +3,14 @@ import requests, traceback
import json, re, xml.etree.ElementTree as ET import json, re, xml.etree.ElementTree as ET
from jinja2 import Template, exceptions, meta, BaseLoader from jinja2 import Template, exceptions, meta, BaseLoader
from jinja2.sandbox import ImmutableSandboxedEnvironment from jinja2.sandbox import ImmutableSandboxedEnvironment
from typing import Optional, Any from typing import (
from typing import List Any,
List,
Mapping,
MutableMapping,
Optional,
Sequence,
)
import litellm import litellm
@ -431,6 +437,35 @@ def format_prompt_togetherai(messages, prompt_format, chat_template):
return prompt return prompt
### IBM Granite
def ibm_granite_pt(messages: list):
"""
IBM's Granite models use the template:
<|system|> {system_message} <|user|> {user_message} <|assistant|> {assistant_message}
See: https://www.ibm.com/docs/en/watsonx-as-a-service?topic=solutions-supported-foundation-models
"""
return custom_prompt(
messages=messages,
role_dict={
"system": {
"pre_message": "<|system|>\n",
"post_message": "\n",
},
"user": {
"pre_message": "<|user|>\n",
"post_message": "\n",
},
"assistant": {
"pre_message": "<|assistant|>\n",
"post_message": "\n",
},
},
).strip()
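
As an example, a short conversation run through `ibm_granite_pt` renders roughly as follows (assuming `custom_prompt` simply wraps each message's content in the pre/post markers above):

from litellm.llms.prompt_templates.factory import ibm_granite_pt

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is watsonx.ai?"},
]
prompt = ibm_granite_pt(messages)
# prompt (roughly):
# <|system|>
# You are a helpful assistant.
# <|user|>
# What is watsonx.ai?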
### ANTHROPIC ### ### ANTHROPIC ###
@ -1017,6 +1052,30 @@ def get_system_prompt(messages):
return system_prompt, messages return system_prompt, messages
def convert_to_documents(
observations: Any,
) -> List[MutableMapping]:
"""Converts observations into a 'document' dict"""
documents: List[MutableMapping] = []
if isinstance(observations, str):
# strings are turned into a key/value pair and a key of 'output' is added.
observations = [{"output": observations}]
elif isinstance(observations, Mapping):
# single mappings are transformed into a list to simplify the rest of the code.
observations = [observations]
elif not isinstance(observations, Sequence):
# all other types are turned into a key/value pair within a list
observations = [{"output": observations}]
for doc in observations:
if not isinstance(doc, Mapping):
# types that aren't Mapping are turned into a key/value pair.
doc = {"output": doc}
documents.append(doc)
return documents
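
A few quick examples of what `convert_to_documents` returns, one per branch above:

from litellm.llms.prompt_templates.factory import convert_to_documents

convert_to_documents("Sunny, 22C")                      # -> [{"output": "Sunny, 22C"}]
convert_to_documents({"temperature": 22, "unit": "C"})  # -> [{"temperature": 22, "unit": "C"}]
convert_to_documents([{"temperature": 22}, "cloudy"])   # -> [{"temperature": 22}, {"output": "cloudy"}]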
def convert_openai_message_to_cohere_tool_result(message): def convert_openai_message_to_cohere_tool_result(message):
""" """
OpenAI message with a tool result looks like: OpenAI message with a tool result looks like:
@ -1058,7 +1117,7 @@ def convert_openai_message_to_cohere_tool_result(message):
"parameters": {"location": "San Francisco, CA"}, "parameters": {"location": "San Francisco, CA"},
"generation_id": tool_call_id, "generation_id": tool_call_id,
}, },
"outputs": [content], "outputs": convert_to_documents(content),
} }
return cohere_tool_result return cohere_tool_result
@ -1071,7 +1130,7 @@ def cohere_message_pt(messages: list):
if message["role"] == "tool": if message["role"] == "tool":
tool_result = convert_openai_message_to_cohere_tool_result(message) tool_result = convert_openai_message_to_cohere_tool_result(message)
tool_results.append(tool_result) tool_results.append(tool_result)
else: elif message.get("content"):
prompt += message["content"] + "\n\n" prompt += message["content"] + "\n\n"
prompt = prompt.rstrip() prompt = prompt.rstrip()
return prompt, tool_results return prompt, tool_results
@ -1346,12 +1405,47 @@ def prompt_factory(
return anthropic_pt(messages=messages) return anthropic_pt(messages=messages)
elif "mistral." in model: elif "mistral." in model:
return mistral_instruct_pt(messages=messages) return mistral_instruct_pt(messages=messages)
elif "llama2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)
elif "llama3" in model and "instruct" in model:
return hf_chat_template(
model="meta-llama/Meta-Llama-3-8B-Instruct",
messages=messages,
)
elif custom_llm_provider == "perplexity": elif custom_llm_provider == "perplexity":
for message in messages: for message in messages:
message.pop("name", None) message.pop("name", None)
return messages return messages
elif custom_llm_provider == "azure_text": elif custom_llm_provider == "azure_text":
return azure_text_pt(messages=messages) return azure_text_pt(messages=messages)
elif custom_llm_provider == "watsonx":
if "granite" in model and "chat" in model:
# granite-13b-chat-v1 and granite-13b-chat-v2 use a specific prompt template
return ibm_granite_pt(messages=messages)
elif "ibm-mistral" in model and "instruct" in model:
# models like ibm-mistral/mixtral-8x7b-instruct-v01-q use the mistral instruct prompt template
return mistral_instruct_pt(messages=messages)
elif "meta-llama/llama-3" in model and "instruct" in model:
# https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/
return custom_prompt(
role_dict={
"system": {
"pre_message": "<|start_header_id|>system<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
"user": {
"pre_message": "<|start_header_id|>user<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
"assistant": {
"pre_message": "<|start_header_id|>assistant<|end_header_id|>\n",
"post_message": "<|eot_id|>",
},
},
messages=messages,
initial_prompt_value="<|begin_of_text|>",
final_prompt_value="<|start_header_id|>assistant<|end_header_id|>\n",
)
try: try:
if "meta-llama/llama-2" in model and "chat" in model: if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages) return llama_2_chat_pt(messages=messages)
@ -1359,11 +1453,8 @@ def prompt_factory(
"meta-llama/llama-3" in model or "meta-llama-3" in model "meta-llama/llama-3" in model or "meta-llama-3" in model
) and "instruct" in model: ) and "instruct" in model:
return hf_chat_template( return hf_chat_template(
model=model, model="meta-llama/Meta-Llama-3-8B-Instruct",
messages=messages, messages=messages,
chat_template=known_tokenizer_config[ # type: ignore
"meta-llama/Meta-Llama-3-8B-Instruct"
]["tokenizer"]["chat_template"],
) )
elif ( elif (
"tiiuae/falcon" in model "tiiuae/falcon" in model

View file

@ -112,10 +112,16 @@ def start_prediction(
} }
initial_prediction_data = { initial_prediction_data = {
"version": version_id,
"input": input_data, "input": input_data,
} }
if ":" in version_id and len(version_id) > 64:
model_parts = version_id.split(":")
if (
len(model_parts) > 1 and len(model_parts[1]) == 64
): ## checks if the model name carries a 64-character version id - e.g. "meta/llama-2-70b-chat:02e509c789964a7ea8736978a43525956ef40397be9033abf9fd2badfe68c9e3"
initial_prediction_data["version"] = model_parts[1]
## LOGGING ## LOGGING
logging_obj.pre_call( logging_obj.pre_call(
input=input_data["prompt"], input=input_data["prompt"],

View file

@ -143,7 +143,9 @@ class VertexAIConfig:
optional_params["temperature"] = value optional_params["temperature"] = value
if param == "top_p": if param == "top_p":
optional_params["top_p"] = value optional_params["top_p"] = value
if param == "stream": if (
param == "stream" and value == True
): # only forward stream=True; sending stream=False can be passed through unchecked and raise issues
optional_params["stream"] = value optional_params["stream"] = value
if param == "n": if param == "n":
optional_params["candidate_count"] = value optional_params["candidate_count"] = value
@ -182,6 +184,20 @@ class VertexAIConfig:
pass pass
return optional_params return optional_params
def get_mapped_special_auth_params(self) -> dict:
"""
Common auth params across bedrock/vertex_ai/azure/watsonx
"""
return {"project": "vertex_project", "region_name": "vertex_location"}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
import asyncio import asyncio
@ -527,6 +543,7 @@ def completion(
"instances": instances, "instances": instances,
"vertex_location": vertex_location, "vertex_location": vertex_location,
"vertex_project": vertex_project, "vertex_project": vertex_project,
"safety_settings": safety_settings,
**optional_params, **optional_params,
} }
if optional_params.get("stream", False) is True: if optional_params.get("stream", False) is True:
@ -541,8 +558,9 @@ def completion(
tools = optional_params.pop("tools", None) tools = optional_params.pop("tools", None)
prompt, images = _gemini_vision_convert_messages(messages=messages) prompt, images = _gemini_vision_convert_messages(messages=messages)
content = [prompt] + images content = [prompt] + images
if "stream" in optional_params and optional_params["stream"] == True: stream = optional_params.pop("stream", False)
stream = optional_params.pop("stream") if stream == True:
request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n" request_str += f"response = llm_model.generate_content({content}, generation_config=GenerationConfig(**{optional_params}), safety_settings={safety_settings}, stream={stream})\n"
logging_obj.pre_call( logging_obj.pre_call(
input=prompt, input=prompt,
@ -810,6 +828,7 @@ async def async_completion(
instances=None, instances=None,
vertex_project=None, vertex_project=None,
vertex_location=None, vertex_location=None,
safety_settings=None,
**optional_params, **optional_params,
): ):
""" """
@ -820,6 +839,7 @@ async def async_completion(
print_verbose("\nMaking VertexAI Gemini Pro/Vision Call") print_verbose("\nMaking VertexAI Gemini Pro/Vision Call")
print_verbose(f"\nProcessing input messages = {messages}") print_verbose(f"\nProcessing input messages = {messages}")
tools = optional_params.pop("tools", None) tools = optional_params.pop("tools", None)
stream = optional_params.pop("stream", False)
prompt, images = _gemini_vision_convert_messages(messages=messages) prompt, images = _gemini_vision_convert_messages(messages=messages)
content = [prompt] + images content = [prompt] + images
@ -840,6 +860,7 @@ async def async_completion(
response = await llm_model._generate_content_async( response = await llm_model._generate_content_async(
contents=content, contents=content,
generation_config=optional_params, generation_config=optional_params,
safety_settings=safety_settings,
tools=tools, tools=tools,
) )
@ -1018,6 +1039,7 @@ async def async_streaming(
instances=None, instances=None,
vertex_project=None, vertex_project=None,
vertex_location=None, vertex_location=None,
safety_settings=None,
**optional_params, **optional_params,
): ):
""" """
@ -1044,6 +1066,7 @@ async def async_streaming(
response = await llm_model._generate_content_streaming_async( response = await llm_model._generate_content_streaming_async(
contents=content, contents=content,
generation_config=optional_params, generation_config=optional_params,
safety_settings=safety_settings,
tools=tools, tools=tools,
) )
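
Since `safety_settings` is now threaded through the sync, async and streaming Gemini paths above, callers can pass safety preferences straight through the completion call. A hedged sketch (the category/threshold strings follow Google's SafetySetting schema; treat the exact values as illustrative):

from litellm import completion

response = completion(
    model="vertex_ai/gemini-pro",
    messages=[{"role": "user", "content": "hello"}],
    safety_settings=[
        {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_ONLY_HIGH"},
        {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_ONLY_HIGH"},
    ],
)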

609
litellm/llms/watsonx.py Normal file
View file

@ -0,0 +1,609 @@
from enum import Enum
import json, types, time # noqa: E401
from contextlib import contextmanager
from typing import Callable, Dict, Optional, Any, Union, List
import httpx
import requests
import litellm
from litellm.utils import ModelResponse, get_secret, Usage
from .base import BaseLLM
from .prompt_templates import factory as ptf
class WatsonXAIError(Exception):
def __init__(self, status_code, message, url: Optional[str] = None):
self.status_code = status_code
self.message = message
url = url or "https://us-south.ml.cloud.ibm.com"
self.request = httpx.Request(method="POST", url=url)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class IBMWatsonXAIConfig:
"""
Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
(See ibm_watsonx_ai.metanames.GenTextParamsMetaNames for a list of all available params)
Supported params for all available watsonx.ai foundational models.
- `decoding_method` (str): One of "greedy" or "sample"
- `temperature` (float): Sets the model temperature for sampling - not available when decoding_method='greedy'.
- `max_new_tokens` (integer): Maximum length of the generated tokens.
- `min_new_tokens` (integer): Minimum number of new tokens to be generated.
- `length_penalty` (dict): A dictionary with keys "decay_factor" and "start_index".
- `stop_sequences` (string[]): list of strings to use as stop sequences.
- `top_k` (integer): top k for sampling - not available when decoding_method='greedy'.
- `top_p` (integer): top p for sampling - not available when decoding_method='greedy'.
- `repetition_penalty` (float): token repetition penalty during text generation.
- `truncate_input_tokens` (integer): Truncate input tokens to this length.
- `include_stop_sequences` (bool): If True, the stop sequence will be included at the end of the generated text in the case of a match.
- `return_options` (dict): A dictionary of options to return. Options include "input_text", "generated_tokens", "input_tokens", "token_ranks". Values are boolean.
- `random_seed` (integer): Random seed for text generation.
- `moderations` (dict): Dictionary of properties that control the moderations, for usages such as Hate and profanity (HAP) and PII filtering.
- `stream` (bool): If True, the model will return a stream of responses.
"""
decoding_method: Optional[str] = "sample"
temperature: Optional[float] = None
max_new_tokens: Optional[int] = None # litellm.max_tokens
min_new_tokens: Optional[int] = None
length_penalty: Optional[dict] = None # e.g {"decay_factor": 2.5, "start_index": 5}
stop_sequences: Optional[List[str]] = None # e.g ["}", ")", "."]
top_k: Optional[int] = None
top_p: Optional[float] = None
repetition_penalty: Optional[float] = None
truncate_input_tokens: Optional[int] = None
include_stop_sequences: Optional[bool] = False
return_options: Optional[Dict[str, bool]] = None
random_seed: Optional[int] = None # e.g 42
moderations: Optional[dict] = None
stream: Optional[bool] = False
def __init__(
self,
decoding_method: Optional[str] = None,
temperature: Optional[float] = None,
max_new_tokens: Optional[int] = None,
min_new_tokens: Optional[int] = None,
length_penalty: Optional[dict] = None,
stop_sequences: Optional[List[str]] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
repetition_penalty: Optional[float] = None,
truncate_input_tokens: Optional[int] = None,
include_stop_sequences: Optional[bool] = None,
return_options: Optional[dict] = None,
random_seed: Optional[int] = None,
moderations: Optional[dict] = None,
stream: Optional[bool] = None,
**kwargs,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"temperature", # equivalent to temperature
"max_tokens", # equivalent to max_new_tokens
"top_p", # equivalent to top_p
"frequency_penalty", # equivalent to repetition_penalty
"stop", # equivalent to stop_sequences
"seed", # equivalent to random_seed
"stream", # equivalent to stream
]
def get_mapped_special_auth_params(self) -> dict:
"""
Common auth params across bedrock/vertex_ai/azure/watsonx
"""
return {
"project": "watsonx_project",
"region_name": "watsonx_region_name",
"token": "watsonx_token",
}
def map_special_auth_params(self, non_default_params: dict, optional_params: dict):
mapped_params = self.get_mapped_special_auth_params()
for param, value in non_default_params.items():
if param in mapped_params:
optional_params[mapped_params[param]] = value
return optional_params
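
Because `__init__` writes the supplied values onto the class and `completion()` later merges `IBMWatsonXAIConfig.get_config()` into any missing `optional_params`, the config class doubles as a place to set process-wide defaults. A small sketch of that pattern (assuming the class is re-exported at the package root like the other provider configs; otherwise import it from `litellm.llms.watsonx`):

import litellm

# requests to watsonx models that don't set these params will pick up the defaults
litellm.IBMWatsonXAIConfig(
    decoding_method="greedy",
    max_new_tokens=200,
    repetition_penalty=1.1,
)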
def convert_messages_to_prompt(model, messages, provider, custom_prompt_dict):
# handle anthropic prompts and amazon titan prompts
if model in custom_prompt_dict:
# check if the model has a registered custom prompt
model_prompt_dict = custom_prompt_dict[model]
prompt = ptf.custom_prompt(
messages=messages,
role_dict=model_prompt_dict.get(
"role_dict", model_prompt_dict.get("roles")
),
initial_prompt_value=model_prompt_dict.get("initial_prompt_value", ""),
final_prompt_value=model_prompt_dict.get("final_prompt_value", ""),
bos_token=model_prompt_dict.get("bos_token", ""),
eos_token=model_prompt_dict.get("eos_token", ""),
)
return prompt
elif provider == "ibm":
prompt = ptf.prompt_factory(
model=model, messages=messages, custom_llm_provider="watsonx"
)
elif provider == "ibm-mistralai":
prompt = ptf.mistral_instruct_pt(messages=messages)
else:
prompt = ptf.prompt_factory(
model=model, messages=messages, custom_llm_provider="watsonx"
)
return prompt
class WatsonXAIEndpoint(str, Enum):
TEXT_GENERATION = "/ml/v1/text/generation"
TEXT_GENERATION_STREAM = "/ml/v1/text/generation_stream"
DEPLOYMENT_TEXT_GENERATION = "/ml/v1/deployments/{deployment_id}/text/generation"
DEPLOYMENT_TEXT_GENERATION_STREAM = (
"/ml/v1/deployments/{deployment_id}/text/generation_stream"
)
EMBEDDINGS = "/ml/v1/text/embeddings"
PROMPTS = "/ml/v1/prompts"
class IBMWatsonXAI(BaseLLM):
"""
Class to interface with IBM Watsonx.ai API for text generation and embeddings.
Reference: https://cloud.ibm.com/apidocs/watsonx-ai
"""
api_version = "2024-03-13"
def __init__(self) -> None:
super().__init__()
def _prepare_text_generation_req(
self,
model_id: str,
prompt: str,
stream: bool,
optional_params: dict,
print_verbose: Optional[Callable] = None,
) -> dict:
"""
Get the request parameters for text generation.
"""
api_params = self._get_api_params(optional_params, print_verbose=print_verbose)
# build auth headers
api_token = api_params.get("token")
headers = {
"Authorization": f"Bearer {api_token}",
"Content-Type": "application/json",
"Accept": "application/json",
}
extra_body_params = optional_params.pop("extra_body", {})
optional_params.update(extra_body_params)
# init the payload to the text generation call
payload = {
"input": prompt,
"moderations": optional_params.pop("moderations", {}),
"parameters": optional_params,
}
request_params = dict(version=api_params["api_version"])
# text generation endpoint deployment or model / stream or not
if model_id.startswith("deployment/"):
# deployment models are passed in as 'deployment/<deployment_id>'
if api_params.get("space_id") is None:
raise WatsonXAIError(
status_code=401,
url=api_params["url"],
message="Error: space_id is required for models called using the 'deployment/' endpoint. Pass in the space_id as a parameter or set it in the WX_SPACE_ID environment variable.",
)
deployment_id = "/".join(model_id.split("/")[1:])
endpoint = (
WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION_STREAM.value
if stream
else WatsonXAIEndpoint.DEPLOYMENT_TEXT_GENERATION.value
)
endpoint = endpoint.format(deployment_id=deployment_id)
else:
payload["model_id"] = model_id
payload["project_id"] = api_params["project_id"]
endpoint = (
WatsonXAIEndpoint.TEXT_GENERATION_STREAM
if stream
else WatsonXAIEndpoint.TEXT_GENERATION
)
url = api_params["url"].rstrip("/") + endpoint
return dict(
method="POST", url=url, headers=headers, json=payload, params=request_params
)
def _get_api_params(
self, params: dict, print_verbose: Optional[Callable] = None
) -> dict:
"""
Find watsonx.ai credentials in the params or environment variables and return them as a dict of request parameters (url, token, project_id, etc.).
"""
# Load auth variables from params
url = params.pop("url", params.pop("api_base", params.pop("base_url", None)))
api_key = params.pop("apikey", None)
token = params.pop("token", None)
project_id = params.pop(
"project_id", params.pop("watsonx_project", None)
) # watsonx.ai project_id - allow 'watsonx_project' to be consistent with how vertex project implementation works -> reduce provider-specific params
space_id = params.pop("space_id", None) # watsonx.ai deployment space_id
region_name = params.pop("region_name", params.pop("region", None))
if region_name is None:
region_name = params.pop(
"watsonx_region_name", params.pop("watsonx_region", None)
) # consistent with how vertex ai + aws regions are accepted
wx_credentials = params.pop(
"wx_credentials",
params.pop(
"watsonx_credentials", None
), # follow {provider}_credentials, same as vertex ai
)
api_version = params.pop("api_version", IBMWatsonXAI.api_version)
# Load auth variables from environment variables
if url is None:
url = (
get_secret("WATSONX_API_BASE") # consistent with 'AZURE_API_BASE'
or get_secret("WATSONX_URL")
or get_secret("WX_URL")
or get_secret("WML_URL")
)
if api_key is None:
api_key = (
get_secret("WATSONX_APIKEY")
or get_secret("WATSONX_API_KEY")
or get_secret("WX_API_KEY")
)
if token is None:
token = get_secret("WATSONX_TOKEN") or get_secret("WX_TOKEN")
if project_id is None:
project_id = (
get_secret("WATSONX_PROJECT_ID")
or get_secret("WX_PROJECT_ID")
or get_secret("PROJECT_ID")
)
if region_name is None:
region_name = (
get_secret("WATSONX_REGION")
or get_secret("WX_REGION")
or get_secret("REGION")
)
if space_id is None:
space_id = (
get_secret("WATSONX_DEPLOYMENT_SPACE_ID")
or get_secret("WATSONX_SPACE_ID")
or get_secret("WX_SPACE_ID")
or get_secret("SPACE_ID")
)
# credentials parsing
if wx_credentials is not None:
url = wx_credentials.get("url", url)
api_key = wx_credentials.get(
"apikey", wx_credentials.get("api_key", api_key)
)
token = wx_credentials.get(
"token",
wx_credentials.get(
"watsonx_token", token
), # follow format of {provider}_token, same as azure - e.g. 'azure_ad_token=..'
)
# verify that all required credentials are present
if url is None:
raise WatsonXAIError(
status_code=401,
message="Error: Watsonx URL not set. Set WX_URL in environment variables or pass in as a parameter.",
)
if token is None and api_key is not None:
# generate the auth token
if print_verbose:
print_verbose("Generating IAM token for Watsonx.ai")
token = self.generate_iam_token(api_key)
elif token is None and api_key is None:
raise WatsonXAIError(
status_code=401,
url=url,
message="Error: API key or token not found. Set WX_API_KEY or WX_TOKEN in environment variables or pass in as a parameter.",
)
if project_id is None:
raise WatsonXAIError(
status_code=401,
url=url,
message="Error: Watsonx project_id not set. Set WX_PROJECT_ID in environment variables or pass in as a parameter.",
)
return {
"url": url,
"api_key": api_key,
"token": token,
"project_id": project_id,
"space_id": space_id,
"region_name": region_name,
"api_version": api_version,
}
def completion(
self,
model: str,
messages: list,
custom_prompt_dict: dict,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
logging_obj,
optional_params: dict,
litellm_params: Optional[dict] = None,
logger_fn=None,
timeout: Optional[float] = None,
):
"""
Send a text generation request to the IBM Watsonx.ai API.
Reference: https://cloud.ibm.com/apidocs/watsonx-ai#text-generation
"""
stream = optional_params.pop("stream", False)
# Load default configs
config = IBMWatsonXAIConfig.get_config()
for k, v in config.items():
if k not in optional_params:
optional_params[k] = v
# Make prompt to send to model
provider = model.split("/")[0]
# model_name = "/".join(model.split("/")[1:])
prompt = convert_messages_to_prompt(
model, messages, provider, custom_prompt_dict
)
def process_text_request(request_params: dict) -> ModelResponse:
with self._manage_response(
request_params, logging_obj=logging_obj, input=prompt, timeout=timeout
) as resp:
json_resp = resp.json()
generated_text = json_resp["results"][0]["generated_text"]
prompt_tokens = json_resp["results"][0]["input_token_count"]
completion_tokens = json_resp["results"][0]["generated_token_count"]
model_response["choices"][0]["message"]["content"] = generated_text
model_response["finish_reason"] = json_resp["results"][0]["stop_reason"]
model_response["created"] = int(time.time())
model_response["model"] = model
setattr(
model_response,
"usage",
Usage(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
),
)
return model_response
def process_stream_request(
request_params: dict,
) -> litellm.CustomStreamWrapper:
# stream the response - generated chunks will be handled
# by litellm.utils.CustomStreamWrapper.handle_watsonx_stream
with self._manage_response(
request_params,
logging_obj=logging_obj,
stream=True,
input=prompt,
timeout=timeout,
) as resp:
response = litellm.CustomStreamWrapper(
resp.iter_lines(),
model=model,
custom_llm_provider="watsonx",
logging_obj=logging_obj,
)
return response
try:
## Get the response from the model
req_params = self._prepare_text_generation_req(
model_id=model,
prompt=prompt,
stream=stream,
optional_params=optional_params,
print_verbose=print_verbose,
)
if stream:
return process_stream_request(req_params)
else:
return process_text_request(req_params)
except WatsonXAIError as e:
raise e
except Exception as e:
raise WatsonXAIError(status_code=500, message=str(e))
def embedding(
self,
model: str,
input: Union[list, str],
api_key: Optional[str] = None,
logging_obj=None,
model_response=None,
optional_params=None,
encoding=None,
):
"""
Send a text embedding request to the IBM Watsonx.ai API.
"""
if optional_params is None:
optional_params = {}
# Load default configs
config = IBMWatsonXAIConfig.get_config()
for k, v in config.items():
if k not in optional_params:
optional_params[k] = v
# Load auth variables from environment variables
if isinstance(input, str):
input = [input]
if api_key is not None:
optional_params["api_key"] = api_key
api_params = self._get_api_params(optional_params)
# build auth headers
api_token = api_params.get("token")
headers = {
"Authorization": f"Bearer {api_token}",
"Content-Type": "application/json",
"Accept": "application/json",
}
# init the payload to the text generation call
payload = {
"inputs": input,
"model_id": model,
"project_id": api_params["project_id"],
"parameters": optional_params,
}
request_params = dict(version=api_params["api_version"])
url = api_params["url"].rstrip("/") + WatsonXAIEndpoint.EMBEDDINGS
# request = httpx.Request(
# "POST", url, headers=headers, json=payload, params=request_params
# )
req_params = {
"method": "POST",
"url": url,
"headers": headers,
"json": payload,
"params": request_params,
}
with self._manage_response(
req_params, logging_obj=logging_obj, input=input
) as resp:
json_resp = resp.json()
results = json_resp.get("results", [])
embedding_response = []
for idx, result in enumerate(results):
embedding_response.append(
{"object": "embedding", "index": idx, "embedding": result["embedding"]}
)
model_response["object"] = "list"
model_response["data"] = embedding_response
model_response["model"] = model
input_tokens = json_resp.get("input_token_count", 0)
model_response.usage = Usage(
prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
)
return model_response
def generate_iam_token(self, api_key=None, **params):
headers = {}
headers["Content-Type"] = "application/x-www-form-urlencoded"
if api_key is None:
api_key = get_secret("WX_API_KEY") or get_secret("WATSONX_API_KEY")
if api_key is None:
raise ValueError("API key is required")
headers["Accept"] = "application/json"
data = {
"grant_type": "urn:ibm:params:oauth:grant-type:apikey",
"apikey": api_key,
}
response = httpx.post(
"https://iam.cloud.ibm.com/identity/token", data=data, headers=headers
)
response.raise_for_status()
json_data = response.json()
iam_access_token = json_data["access_token"]
self.token = iam_access_token
return iam_access_token
@contextmanager
def _manage_response(
self,
request_params: dict,
logging_obj: Any,
stream: bool = False,
input: Optional[Any] = None,
timeout: Optional[float] = None,
):
request_str = (
f"response = {request_params['method']}(\n"
f"\turl={request_params['url']},\n"
f"\tjson={request_params['json']},\n"
f")"
)
logging_obj.pre_call(
input=input,
api_key=request_params["headers"].get("Authorization"),
additional_args={
"complete_input_dict": request_params["json"],
"request_str": request_str,
},
)
if timeout:
request_params["timeout"] = timeout
try:
if stream:
resp = requests.request(
**request_params,
stream=True,
)
resp.raise_for_status()
yield resp
else:
resp = requests.request(**request_params)
resp.raise_for_status()
yield resp
except Exception as e:
raise WatsonXAIError(status_code=500, message=str(e))
if not stream:
logging_obj.post_call(
input=input,
api_key=request_params["headers"].get("Authorization"),
original_response=json.dumps(resp.json()),
additional_args={
"status_code": resp.status_code,
"complete_input_dict": request_params["json"],
},
)
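
For reference, a minimal usage sketch of the handler above as it is wired up through `litellm.completion`. `WATSONX_API_KEY` appears in `generate_iam_token` above; the other environment-variable names and the model id are assumptions for illustration, resolved by `_get_api_params`, which is not shown in this hunk.

```python
# Hedged usage sketch for the IBMWatsonXAI handler above.
# WATSONX_API_KEY is read by generate_iam_token; the other env var names and
# the model id are illustrative assumptions, not pinned down by this diff.
import os

import litellm

os.environ["WATSONX_API_KEY"] = "your-ibm-cloud-api-key"
os.environ["WATSONX_URL"] = "https://us-south.ml.cloud.ibm.com"  # assumed variable name
os.environ["WATSONX_PROJECT_ID"] = "your-project-id"             # assumed variable name

response = litellm.completion(
    model="watsonx/ibm/granite-13b-chat-v2",  # example watsonx model id
    messages=[{"role": "user", "content": "Say hello from watsonx"}],
)
print(response.choices[0].message.content)
```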

View file

@ -63,6 +63,7 @@ from .llms import (
vertex_ai, vertex_ai,
vertex_ai_anthropic, vertex_ai_anthropic,
maritalk, maritalk,
watsonx,
) )
from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion from .llms.openai import OpenAIChatCompletion, OpenAITextCompletion
from .llms.azure import AzureChatCompletion from .llms.azure import AzureChatCompletion
@ -360,7 +361,7 @@ def mock_completion(
model: str, model: str,
messages: List, messages: List,
stream: Optional[bool] = False, stream: Optional[bool] = False,
mock_response: str = "This is a mock request", mock_response: Union[str, Exception] = "This is a mock request",
logging=None, logging=None,
**kwargs, **kwargs,
): ):
@ -387,6 +388,20 @@ def mock_completion(
- If 'stream' is True, it returns a response that mimics the behavior of a streaming completion. - If 'stream' is True, it returns a response that mimics the behavior of a streaming completion.
""" """
try: try:
## LOGGING
if logging is not None:
logging.pre_call(
input=messages,
api_key="mock-key",
)
if isinstance(mock_response, Exception):
raise litellm.APIError(
status_code=500, # type: ignore
message=str(mock_response),
llm_provider="openai", # type: ignore
model=model, # type: ignore
request=httpx.Request(method="POST", url="https://api.openai.com/v1/"),
)
model_response = ModelResponse(stream=stream) model_response = ModelResponse(stream=stream)
if stream is True: if stream is True:
# don't try to access stream object, # don't try to access stream object,
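
The widened `mock_response: Union[str, Exception]` signature above lets a test force the mocked call to fail; a small sketch of that path:

```python
# Sketch of the new Exception branch in mock_completion: passing an Exception
# as mock_response makes the call raise litellm.APIError instead of returning
# a canned response, which is useful for exercising retry/fallback logic.
import litellm

try:
    litellm.completion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hi"}],
        mock_response=Exception("simulated provider outage"),
    )
except litellm.APIError as e:
    print("caught mocked failure:", e)
```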
@ -1864,6 +1879,43 @@ def completion(
## RESPONSE OBJECT ## RESPONSE OBJECT
response = response response = response
elif custom_llm_provider == "watsonx":
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
response = watsonx.IBMWatsonXAI().completion(
model=model,
messages=messages,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
optional_params=optional_params,
litellm_params=litellm_params, # type: ignore
logger_fn=logger_fn,
encoding=encoding,
logging_obj=logging,
timeout=timeout,
)
if (
"stream" in optional_params
and optional_params["stream"] == True
and not isinstance(response, CustomStreamWrapper)
):
# don't try to access stream object,
response = CustomStreamWrapper(
iter(response),
model,
custom_llm_provider="watsonx",
logging_obj=logging,
)
if optional_params.get("stream", False):
## LOGGING
logging.post_call(
input=messages,
api_key=None,
original_response=response,
)
## RESPONSE OBJECT
response = response
elif custom_llm_provider == "vllm": elif custom_llm_provider == "vllm":
custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict custom_prompt_dict = custom_prompt_dict or litellm.custom_prompt_dict
model_response = vllm.completion( model_response = vllm.completion(
@ -2943,6 +2995,15 @@ def embedding(
client=client, client=client,
aembedding=aembedding, aembedding=aembedding,
) )
elif custom_llm_provider == "watsonx":
response = watsonx.IBMWatsonXAI().embedding(
model=model,
input=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,
model_response=EmbeddingResponse(),
)
else: else:
args = locals() args = locals()
raise ValueError(f"No valid embedding model args passed in - {args}") raise ValueError(f"No valid embedding model args passed in - {args}")

View file

@ -1418,6 +1418,123 @@
"litellm_provider": "replicate", "litellm_provider": "replicate",
"mode": "chat" "mode": "chat"
}, },
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-instruct-v0.2": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.000001,
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/openai/gpt-3.5-turbo": { "openrouter/openai/gpt-3.5-turbo": {
"max_tokens": 4095, "max_tokens": 4095,
"input_cost_per_token": 0.0000015, "input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
"litellm_provider": "openrouter", "litellm_provider": "openrouter",
"mode": "chat" "mode": "chat"
}, },
"openrouter/anthropic/claude-3-opus": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "openrouter",
"mode": "chat",
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 395
},
"openrouter/google/palm-2-chat-bison": { "openrouter/google/palm-2-chat-bison": {
"max_tokens": 8000, "max_tokens": 8000,
"input_cost_per_token": 0.0000005, "input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
"litellm_provider": "bedrock", "litellm_provider": "bedrock",
"mode": "chat" "mode": "chat"
}, },
"meta.llama3-8b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-70b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77, "max_tokens": 77,
"max_input_tokens": 77, "max_input_tokens": 77,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{93553:function(n,e,t){Promise.resolve().then(t.t.bind(t,63385,23)),Promise.resolve().then(t.t.bind(t,99646,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_12bbc4', '__Inter_Fallback_12bbc4'",fontStyle:"normal"},className:"__className_12bbc4"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=93553)}),_N_E=n.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[185],{87421:function(n,e,t){Promise.resolve().then(t.t.bind(t,99646,23)),Promise.resolve().then(t.t.bind(t,63385,23))},63385:function(){},99646:function(n){n.exports={style:{fontFamily:"'__Inter_c23dc8', '__Inter_Fallback_c23dc8'",fontStyle:"normal"},className:"__className_c23dc8"}}},function(n){n.O(0,[971,69,744],function(){return n(n.s=87421)}),_N_E=n.O()}]);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]); (self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{70377:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(70377)}),_N_E=e.O()}]);

View file

@ -1 +0,0 @@
(self.webpackChunk_N_E=self.webpackChunk_N_E||[]).push([[744],{32028:function(e,n,t){Promise.resolve().then(t.t.bind(t,47690,23)),Promise.resolve().then(t.t.bind(t,48955,23)),Promise.resolve().then(t.t.bind(t,5613,23)),Promise.resolve().then(t.t.bind(t,11902,23)),Promise.resolve().then(t.t.bind(t,31778,23)),Promise.resolve().then(t.t.bind(t,77831,23))}},function(e){var n=function(n){return e(e.s=n)};e.O(0,[971,69],function(){return n(35317),n(32028)}),_N_E=e.O()}]);

View file

@ -1 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/60d9f441227ccc7e.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var 
n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}(); !function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/9f51f0573c6b0365.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var 
o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1,5 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-50c1dadc6557c101.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[82332,[\"127\",\"static/chunks/127-efd0436630e294eb.js\",\"931\",\"static/chunks/app/page-525d83925fd5350b.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/60d9f441227ccc7e.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"Csz8BqWx6JEoKsgLqCeCt\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html> <<<<<<< HEAD
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-202e312607f242a1.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"761\",\"static/chunks/761-05f8a8451296476c.js\",\"931\",\"static/chunks/app/page-5a4a198eefedc775.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/00c2ddbcd01819c0.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"c5rha8cqAah-saaczjn02\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
=======
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-dafd44dfa2da140c.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e49705773ae41779.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-096338c8e1915716.js" async="" crossorigin=""></script><title>LiteLLM Dashboard</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-65a932b4e8bd8abb.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[46414,[\"386\",\"static/chunks/386-d811195b597a2122.js\",\"931\",\"static/chunks/app/page-e0ee34389254cdf2.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/9f51f0573c6b0365.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"dWGL92c5LzTMn7XX6utn2\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_12bbc4\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 
0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"LiteLLM Dashboard\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)

View file

@ -1,7 +1,14 @@
2:I[77831,[],""] 2:I[77831,[],""]
3:I[82332,["127","static/chunks/127-efd0436630e294eb.js","931","static/chunks/app/page-525d83925fd5350b.js"],""] <<<<<<< HEAD
3:I[46414,["761","static/chunks/761-05f8a8451296476c.js","931","static/chunks/app/page-5a4a198eefedc775.js"],""]
4:I[5613,[],""] 4:I[5613,[],""]
5:I[31778,[],""] 5:I[31778,[],""]
0:["Csz8BqWx6JEoKsgLqCeCt",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/60d9f441227ccc7e.css","precedence":"next","crossOrigin":""}]],"$L6"]]]] 0:["c5rha8cqAah-saaczjn02",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 
0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/00c2ddbcd01819c0.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
=======
3:I[46414,["386","static/chunks/386-d811195b597a2122.js","931","static/chunks/app/page-e0ee34389254cdf2.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["dWGL92c5LzTMn7XX6utn2",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_12bbc4","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/9f51f0573c6b0365.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
>>>>>>> 73a7b4f4 (refactor(main.py): trigger new build)
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]] 6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"LiteLLM Dashboard"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null 1:null

View file

@ -1,51 +1,15 @@
environment_variables:
SLACK_WEBHOOK_URL: SQD2/FQHvDuj6Q9/Umyqi+EKLNKKLRCXETX2ncO0xCIQp6EHCKiYD7jPW0+1QdrsQ+pnEzhsfVY2r21SiQV901n/9iyJ2tSnEyWViP7FKQVtTvwutsAqSqbiVHxLHbpjPCu03fhS/idjZrtK7dJLbLBB3RgudjNjHg==
general_settings:
alerting:
- slack
alerting_threshold: 300
database_connection_pool_limit: 100
database_connection_timeout: 60
health_check_interval: 300
proxy_batch_write_at: 10
ui_access_mode: all
litellm_settings:
allowed_fails: 3
failure_callback:
- prometheus
fallbacks:
- gpt-3.5-turbo:
- fake-openai-endpoint
- gpt-4
num_retries: 3
service_callback:
- prometheus_system
success_callback:
- prometheus
model_list: model_list:
- litellm_params: - litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/ api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key api_key: my-fake-key
model: openai/my-fake-model model: openai/my-fake-model
model_name: fake-openai-endpoint model_name: fake-openai-endpoint
- litellm_params:
model: gpt-3.5-turbo
model_name: gpt-3.5-turbo
- model_name: llama-3
litellm_params:
model: replicate/meta/meta-llama-3-8b-instruct
router_settings: router_settings:
allowed_fails: 3 num_retries: 0
context_window_fallbacks: null enable_pre_call_checks: true
cooldown_time: 1 redis_host: os.environ/REDIS_HOST
fallbacks: redis_password: os.environ/REDIS_PASSWORD
- gpt-3.5-turbo: redis_port: os.environ/REDIS_PORT
- fake-openai-endpoint
- gpt-4 litellm_settings:
- gpt-3.5-turbo-3: success_callback: ["openmeter"]
- fake-openai-endpoint
num_retries: 3
retry_after: 0
routing_strategy: simple-shuffle
routing_strategy_args: {}
timeout: 6000

View file

@ -422,6 +422,9 @@ class LiteLLM_ModelTable(LiteLLMBase):
created_by: str created_by: str
updated_by: str updated_by: str
class Config:
protected_namespaces = ()
class NewUserRequest(GenerateKeyRequest): class NewUserRequest(GenerateKeyRequest):
max_budget: Optional[float] = None max_budget: Optional[float] = None
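
The repeated `class Config: protected_namespaces = ()` additions silence Pydantic's warning for field names that start with `model_` (e.g. `model_id`, `model_aliases`, `model_max_budget`). A standalone sketch of the behaviour, assuming Pydantic v2 semantics; it is not one of the proxy's actual models:

```python
# Standalone illustration (not a proxy model): without protected_namespaces = (),
# Pydantic v2 warns that "model_id" clashes with its reserved "model_" namespace.
from pydantic import BaseModel


class ExampleTable(BaseModel):
    model_id: str

    class Config:
        protected_namespaces = ()


print(ExampleTable(model_id="abc-123"))
```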
@ -485,6 +488,9 @@ class TeamBase(LiteLLMBase):
class NewTeamRequest(TeamBase): class NewTeamRequest(TeamBase):
model_aliases: Optional[dict] = None model_aliases: Optional[dict] = None
class Config:
protected_namespaces = ()
class GlobalEndUsersSpend(LiteLLMBase): class GlobalEndUsersSpend(LiteLLMBase):
api_key: Optional[str] = None api_key: Optional[str] = None
@ -534,6 +540,9 @@ class LiteLLM_TeamTable(TeamBase):
budget_reset_at: Optional[datetime] = None budget_reset_at: Optional[datetime] = None
model_id: Optional[int] = None model_id: Optional[int] = None
class Config:
protected_namespaces = ()
@root_validator(pre=True) @root_validator(pre=True)
def set_model_info(cls, values): def set_model_info(cls, values):
dict_fields = [ dict_fields = [
@ -570,6 +579,9 @@ class LiteLLM_BudgetTable(LiteLLMBase):
model_max_budget: Optional[dict] = None model_max_budget: Optional[dict] = None
budget_duration: Optional[str] = None budget_duration: Optional[str] = None
class Config:
protected_namespaces = ()
class NewOrganizationRequest(LiteLLM_BudgetTable): class NewOrganizationRequest(LiteLLM_BudgetTable):
organization_id: Optional[str] = None organization_id: Optional[str] = None
@ -720,6 +732,10 @@ class ConfigGeneralSettings(LiteLLMBase):
None, None,
description="List of alerting types. By default it is all alerts", description="List of alerting types. By default it is all alerts",
) )
alert_to_webhook_url: Optional[Dict] = Field(
None,
description="Mapping of alert type to webhook url. e.g. `alert_to_webhook_url: {'budget_alerts': 'https://hooks.slack.com/services/T00000000/B00000000/XXXXXXXXXXXXXXXXXXXXXXXX'}`",
)
alerting_threshold: Optional[int] = Field( alerting_threshold: Optional[int] = Field(
None, None,
@ -896,5 +912,19 @@ class LiteLLM_SpendLogs(LiteLLMBase):
request_tags: Optional[Json] = None request_tags: Optional[Json] = None
class LiteLLM_ErrorLogs(LiteLLMBase):
request_id: Optional[str] = str(uuid.uuid4())
api_base: Optional[str] = ""
model_group: Optional[str] = ""
litellm_model_name: Optional[str] = ""
model_id: Optional[str] = ""
request_kwargs: Optional[dict] = {}
exception_type: Optional[str] = ""
status_code: Optional[str] = ""
exception_string: Optional[str] = ""
startTime: Union[str, datetime, None]
endTime: Union[str, datetime, None]
class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase): class LiteLLM_SpendLogs_ResponseObject(LiteLLMBase):
response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None response: Optional[List[Union[LiteLLM_SpendLogs, Any]]] = None

View file

@ -95,7 +95,15 @@ def common_checks(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}" f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
) )
# 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget # 7. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
if litellm.max_budget > 0 and global_proxy_spend is not None: if (
litellm.max_budget > 0
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses its global budget
and route in LiteLLMRoutes.openai_routes.value
and route != "/v1/models"
and route != "/models"
):
if global_proxy_spend > litellm.max_budget: if global_proxy_spend > litellm.max_budget:
raise Exception( raise Exception(
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}" f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"

View file

@ -1059,8 +1059,18 @@ async def user_api_key_auth(
): ):
pass pass
else: else:
user_role = "unknown"
user_id = "unknown"
if user_id_information is not None and isinstance(
user_id_information, list
):
_user = user_id_information[0]
user_role = _user.get("user_role", {}).get(
"user_role", "unknown"
)
user_id = _user.get("user_id", "unknown")
raise Exception( raise Exception(
f"Only master key can be used to generate, delete, update info for new keys/users/teams. Route={route}" f"Only proxy admin can be used to generate, delete, update info for new keys/users/teams. Route={route}. Your role={user_role}. Your user_id={user_id}"
) )
# check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions # check if token is from litellm-ui, litellm ui makes keys to allow users to login with sso. These keys can only be used for LiteLLM UI functions
@ -1207,6 +1217,68 @@ def cost_tracking():
litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore litellm.success_callback.append(_PROXY_track_cost_callback) # type: ignore
async def _PROXY_failure_handler(
kwargs, # kwargs to completion
completion_response: litellm.ModelResponse, # response from completion
start_time=None,
end_time=None, # start/end time for completion
):
global prisma_client
if prisma_client is not None:
verbose_proxy_logger.debug(
"inside _PROXY_failure_handler kwargs=", extra=kwargs
)
_exception = kwargs.get("exception")
_exception_type = _exception.__class__.__name__
_model = kwargs.get("model", None)
_optional_params = kwargs.get("optional_params", {})
_optional_params = copy.deepcopy(_optional_params)
for k, v in _optional_params.items():
    # truncate long values so the stored error-log row stays small
    _optional_params[k] = str(v)[:100]
_status_code = "500"
try:
_status_code = str(_exception.status_code)
except:
# Don't let this fail logging the exception to the DB
pass
_litellm_params = kwargs.get("litellm_params", {}) or {}
_metadata = _litellm_params.get("metadata", {}) or {}
_model_id = _metadata.get("model_info", {}).get("id", "")
_model_group = _metadata.get("model_group", "")
api_base = litellm.get_api_base(model=_model, optional_params=_litellm_params)
_exception_string = str(_exception)[:500]
error_log = LiteLLM_ErrorLogs(
request_id=str(uuid.uuid4()),
model_group=_model_group,
model_id=_model_id,
litellm_model_name=kwargs.get("model"),
request_kwargs=_optional_params,
api_base=api_base,
exception_type=_exception_type,
status_code=_status_code,
exception_string=_exception_string,
startTime=kwargs.get("start_time"),
endTime=kwargs.get("end_time"),
)
# helper function to convert to dict on pydantic v2 & v1
error_log_dict = _get_pydantic_json_dict(error_log)
error_log_dict["request_kwargs"] = json.dumps(error_log_dict["request_kwargs"])
await prisma_client.db.litellm_errorlogs.create(
data=error_log_dict # type: ignore
)
pass
async def _PROXY_track_cost_callback( async def _PROXY_track_cost_callback(
kwargs, # kwargs to completion kwargs, # kwargs to completion
completion_response: litellm.ModelResponse, # response from completion completion_response: litellm.ModelResponse, # response from completion
@ -1292,6 +1364,15 @@ async def _PROXY_track_cost_callback(
verbose_proxy_logger.debug("error in tracking cost callback - %s", e) verbose_proxy_logger.debug("error in tracking cost callback - %s", e)
def error_tracking():
global prisma_client, custom_db_client
if prisma_client is not None or custom_db_client is not None:
if isinstance(litellm.failure_callback, list):
verbose_proxy_logger.debug("setting litellm failure callback to track cost")
if (_PROXY_failure_handler) not in litellm.failure_callback: # type: ignore
litellm.failure_callback.append(_PROXY_failure_handler) # type: ignore
def _set_spend_logs_payload( def _set_spend_logs_payload(
payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None payload: dict, prisma_client: PrismaClient, spend_logs_url: Optional[str] = None
): ):
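
For context on the handler above: anything appended to `litellm.failure_callback` is invoked with the call's kwargs and timing after an exception, which is how `_PROXY_failure_handler` receives `exception`, `litellm_params`, and the rest. A minimal sketch of that convention; the dispatch details inside litellm are not part of this diff.

```python
# Hedged sketch of the failure-callback signature _PROXY_failure_handler follows.
# Registering the handler is all that error_tracking() above does for the proxy.
import litellm


async def log_failed_call(kwargs, completion_response, start_time=None, end_time=None):
    exc = kwargs.get("exception")
    print(f"call to {kwargs.get('model')} failed: {exc}")


litellm.failure_callback.append(log_failed_call)
```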
@ -2612,9 +2693,10 @@ class ProxyConfig:
environment_variables = config_data.get("environment_variables", {}) environment_variables = config_data.get("environment_variables", {})
for k, v in environment_variables.items(): for k, v in environment_variables.items():
try: try:
decoded_b64 = base64.b64decode(v) if v is not None:
value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore decoded_b64 = base64.b64decode(v)
os.environ[k] = value value = decrypt_value(value=decoded_b64, master_key=master_key) # type: ignore
os.environ[k] = value
except Exception as e: except Exception as e:
verbose_proxy_logger.error( verbose_proxy_logger.error(
"Error setting env variable: %s - %s", k, str(e) "Error setting env variable: %s - %s", k, str(e)
@ -2632,9 +2714,17 @@ class ProxyConfig:
if "alert_types" in _general_settings: if "alert_types" in _general_settings:
general_settings["alert_types"] = _general_settings["alert_types"] general_settings["alert_types"] = _general_settings["alert_types"]
proxy_logging_obj.alert_types = general_settings["alert_types"] proxy_logging_obj.alert_types = general_settings["alert_types"]
proxy_logging_obj.slack_alerting_instance.alert_types = general_settings[ proxy_logging_obj.slack_alerting_instance.update_values(
"alert_types" alert_types=general_settings["alert_types"]
)
if "alert_to_webhook_url" in _general_settings:
general_settings["alert_to_webhook_url"] = _general_settings[
"alert_to_webhook_url"
] ]
proxy_logging_obj.slack_alerting_instance.update_values(
alert_to_webhook_url=general_settings["alert_to_webhook_url"]
)
# router settings # router settings
if llm_router is not None and prisma_client is not None: if llm_router is not None and prisma_client is not None:
@ -3176,6 +3266,9 @@ async def startup_event():
## COST TRACKING ## ## COST TRACKING ##
cost_tracking() cost_tracking()
## Error Tracking ##
error_tracking()
db_writer_client = HTTPHandler() db_writer_client = HTTPHandler()
proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made proxy_logging_obj._init_litellm_callbacks() # INITIALIZE LITELLM CALLBACKS ON SERVER STARTUP <- do this to catch any logging errors on startup, not when calls are being made
@ -3655,6 +3748,17 @@ async def chat_completion(
if data["model"] in litellm.model_alias_map: if data["model"] in litellm.model_alias_map:
data["model"] = litellm.model_alias_map[data["model"]] data["model"] = litellm.model_alias_map[data["model"]]
## LOGGING OBJECT ## - initialize logging object for logging success/failure events for call
data["litellm_call_id"] = str(uuid.uuid4())
logging_obj, data = litellm.utils.function_setup(
original_function="acompletion",
rules_obj=litellm.utils.Rules(),
start_time=datetime.now(),
**data,
)
data["litellm_logging_obj"] = logging_obj
### CALL HOOKS ### - modify incoming data before calling the model ### CALL HOOKS ### - modify incoming data before calling the model
data = await proxy_logging_obj.pre_call_hook( data = await proxy_logging_obj.pre_call_hook(
user_api_key_dict=user_api_key_dict, data=data, call_type="completion" user_api_key_dict=user_api_key_dict, data=data, call_type="completion"
@ -7421,9 +7525,9 @@ async def model_info_v2(
) )
async def model_metrics( async def model_metrics(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = None, _selected_model_group: Optional[str] = "gpt-4-32k",
startTime: Optional[datetime] = datetime.now() - timedelta(days=30), startTime: Optional[datetime] = None,
endTime: Optional[datetime] = datetime.now(), endTime: Optional[datetime] = None,
): ):
global prisma_client, llm_router global prisma_client, llm_router
if prisma_client is None: if prisma_client is None:
@ -7433,65 +7537,214 @@ async def model_metrics(
param="None", param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR, code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )
if _selected_model_group and llm_router is not None: startTime = startTime or datetime.now() - timedelta(days=30)
_model_list = llm_router.get_model_list() endTime = endTime or datetime.now()
_relevant_api_bases = []
for model in _model_list: sql_query = """
if model["model_name"] == _selected_model_group: SELECT
_litellm_params = model["litellm_params"] api_base,
_api_base = _litellm_params.get("api_base", "") model,
_relevant_api_bases.append(_api_base) DATE_TRUNC('day', "startTime")::DATE AS day,
_relevant_api_bases.append(_api_base + "/openai/") AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) / SUM(total_tokens) AS avg_latency_per_token
FROM
"LiteLLM_SpendLogs"
WHERE
"startTime" >= NOW() - INTERVAL '30 days'
AND "model" = $1 AND "cache_hit" != 'True'
GROUP BY
api_base,
model,
day
HAVING
SUM(total_tokens) > 0
ORDER BY
avg_latency_per_token DESC;
"""
_all_api_bases = set()
db_response = await prisma_client.db.query_raw(
sql_query, _selected_model_group, startTime, endTime
)
_daily_entries: dict = {} # {"Jun 23": {"model1": 0.002, "model2": 0.003}}
if db_response is not None:
for model_data in db_response:
_api_base = model_data["api_base"]
_model = model_data["model"]
_day = model_data["day"]
_avg_latency_per_token = model_data["avg_latency_per_token"]
if _day not in _daily_entries:
_daily_entries[_day] = {}
_combined_model_name = str(_model)
if "https://" in _api_base:
_combined_model_name = str(_api_base)
if "/openai/" in _combined_model_name:
_combined_model_name = _combined_model_name.split("/openai/")[0]
_all_api_bases.add(_combined_model_name)
_daily_entries[_day][_combined_model_name] = _avg_latency_per_token
sql_query = """
SELECT
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base,
COUNT(*) AS num_requests,
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
FROM "LiteLLM_SpendLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
AND api_base = ANY($3)
GROUP BY CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END
ORDER BY num_requests DESC
LIMIT 50;
""" """
each entry needs to be like this:
{
date: 'Jun 23',
'gpt-4-https://api.openai.com/v1/': 0.002,
'gpt-43-https://api.openai.com-12/v1/': 0.002,
}
"""
# convert daily entries to list of dicts
db_response = await prisma_client.db.query_raw( response: List[dict] = []
sql_query, startTime, endTime, _relevant_api_bases
# sort daily entries by date
_daily_entries = dict(sorted(_daily_entries.items(), key=lambda item: item[0]))
for day in _daily_entries:
entry = {"date": str(day)}
for model_key, latency in _daily_entries[day].items():
entry[model_key] = latency
response.append(entry)
return {
"data": response,
"all_api_bases": list(_all_api_bases),
}
@router.get(
"/model/metrics/slow_responses",
description="View number of hanging requests per model_group",
tags=["model management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def model_metrics_slow_responses(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = "gpt-4-32k",
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router, proxy_logging_obj
if prisma_client is None:
raise ProxyException(
message="Prisma Client is not initialized",
type="internal_error",
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )
else: startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
sql_query = """ alerting_threshold = (
SELECT proxy_logging_obj.slack_alerting_instance.alerting_threshold or 300
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END AS combined_model_api_base, )
COUNT(*) AS num_requests, alerting_threshold = int(alerting_threshold)
AVG(EXTRACT(epoch FROM ("endTime" - "startTime"))) AS avg_latency_seconds
FROM sql_query = """
"LiteLLM_SpendLogs" SELECT
api_base,
COUNT(*) AS total_count,
SUM(CASE
WHEN ("endTime" - "startTime") >= (INTERVAL '1 SECOND' * CAST($1 AS INTEGER)) THEN 1
ELSE 0
END) AS slow_count
FROM
"LiteLLM_SpendLogs"
WHERE
"model" = $2
AND "cache_hit" != 'True'
GROUP BY
api_base
ORDER BY
slow_count DESC;
"""
db_response = await prisma_client.db.query_raw(
sql_query, alerting_threshold, _selected_model_group
)
if db_response is not None:
for row in db_response:
_api_base = row.get("api_base") or ""
if "/openai/" in _api_base:
_api_base = _api_base.split("/openai/")[0]
row["api_base"] = _api_base
return db_response
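
The slow_responses query above does the slow/total split in SQL against the alerting threshold. The same classification can be sketched in plain Python, assuming request logs carry `startTime`/`endTime` datetimes and the threshold is in seconds; `count_slow_requests` is a hypothetical helper, not part of the proxy.

from datetime import datetime, timedelta
from typing import List

def count_slow_requests(logs: List[dict], alerting_threshold: int) -> dict:
    # group request logs by api_base and count those slower than the threshold (seconds)
    out: dict = {}
    for log in logs:
        api_base = (log.get("api_base") or "").split("/openai/")[0]
        bucket = out.setdefault(api_base, {"total_count": 0, "slow_count": 0})
        bucket["total_count"] += 1
        duration = (log["endTime"] - log["startTime"]).total_seconds()
        if duration >= alerting_threshold:
            bucket["slow_count"] += 1
    return out

now = datetime.now()
print(
    count_slow_requests(
        [
            {"api_base": "https://api.openai.com/v1", "startTime": now, "endTime": now + timedelta(seconds=400)},
            {"api_base": "https://api.openai.com/v1", "startTime": now, "endTime": now + timedelta(seconds=2)},
        ],
        alerting_threshold=300,
    )
)
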
@router.get(
"/model/metrics/exceptions",
description="View number of failed requests per model on config.yaml",
tags=["model management"],
include_in_schema=False,
dependencies=[Depends(user_api_key_auth)],
)
async def model_metrics_exceptions(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
_selected_model_group: Optional[str] = None,
startTime: Optional[datetime] = None,
endTime: Optional[datetime] = None,
):
global prisma_client, llm_router
if prisma_client is None:
raise ProxyException(
message="Prisma Client is not initialized",
type="internal_error",
param="None",
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
startTime = startTime or datetime.now() - timedelta(days=30)
endTime = endTime or datetime.now()
"""
"""
sql_query = """
WITH cte AS (
SELECT
CASE WHEN api_base = '' THEN litellm_model_name ELSE CONCAT(litellm_model_name, '-', api_base) END AS combined_model_api_base,
exception_type,
COUNT(*) AS num_exceptions
FROM "LiteLLM_ErrorLogs"
WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp WHERE "startTime" >= $1::timestamp AND "endTime" <= $2::timestamp
GROUP BY GROUP BY combined_model_api_base, exception_type
CASE WHEN api_base = '' THEN model ELSE CONCAT(model, '-', api_base) END )
ORDER BY SELECT
num_requests DESC combined_model_api_base,
LIMIT 50; COUNT(*) AS total_exceptions,
""" json_object_agg(exception_type, num_exceptions) AS exception_counts
FROM cte
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime) GROUP BY combined_model_api_base
ORDER BY total_exceptions DESC
LIMIT 200;
"""
db_response = await prisma_client.db.query_raw(sql_query, startTime, endTime)
response: List[dict] = [] response: List[dict] = []
if response is not None: exception_types = set()
"""
Return Data
{
"combined_model_api_base": "gpt-3.5-turbo-https://api.openai.com/v1/,
"total_exceptions": 5,
"BadRequestException": 5,
"TimeoutException": 2
}
"""
if db_response is not None:
# loop through all models # loop through all models
for model_data in db_response: for model_data in db_response:
model = model_data.get("combined_model_api_base", "") model = model_data.get("combined_model_api_base", "")
num_requests = model_data.get("num_requests", 0) total_exceptions = model_data.get("total_exceptions", 0)
avg_latency_seconds = model_data.get("avg_latency_seconds", 0) exception_counts = model_data.get("exception_counts", {})
response.append( curr_row = {
{ "model": model,
"model": model, "total_exceptions": total_exceptions,
"num_requests": num_requests, }
"avg_latency_seconds": avg_latency_seconds, curr_row.update(exception_counts)
} response.append(curr_row)
) for k, v in exception_counts.items():
return response exception_types.add(k)
return {"data": response, "exception_types": list(exception_types)}
@router.get( @router.get(
@ -8453,6 +8706,13 @@ async def update_config(config_info: ConfigYAML):
_existing_settings = config["general_settings"] _existing_settings = config["general_settings"]
for k, v in updated_general_settings.items(): for k, v in updated_general_settings.items():
# overwrite existing settings with updated values # overwrite existing settings with updated values
if k == "alert_to_webhook_url":
# check if slack is already enabled. if not, enable it
if "slack" not in _existing_settings:
if "alerting" not in _existing_settings:
_existing_settings["alerting"] = ["slack"]
elif isinstance(_existing_settings["alerting"], list):
_existing_settings["alerting"].append("slack")
_existing_settings[k] = v _existing_settings[k] = v
config["general_settings"] = _existing_settings config["general_settings"] = _existing_settings
@ -8567,7 +8827,25 @@ async def get_config():
""" """
for _callback in _success_callbacks: for _callback in _success_callbacks:
if _callback == "langfuse": if _callback == "openmeter":
env_vars = [
"OPENMETER_API_KEY",
]
env_vars_dict = {}
for _var in env_vars:
env_variable = environment_variables.get(_var, None)
if env_variable is None:
env_vars_dict[_var] = None
else:
# decode + decrypt the value
decoded_b64 = base64.b64decode(env_variable)
_decrypted_value = decrypt_value(
value=decoded_b64, master_key=master_key
)
env_vars_dict[_var] = _decrypted_value
_data_to_return.append({"name": _callback, "variables": env_vars_dict})
elif _callback == "langfuse":
_langfuse_vars = [ _langfuse_vars = [
"LANGFUSE_PUBLIC_KEY", "LANGFUSE_PUBLIC_KEY",
"LANGFUSE_SECRET_KEY", "LANGFUSE_SECRET_KEY",
@ -8592,6 +8870,7 @@ async def get_config():
# Check if slack alerting is on # Check if slack alerting is on
_alerting = _general_settings.get("alerting", []) _alerting = _general_settings.get("alerting", [])
alerting_data = []
if "slack" in _alerting: if "slack" in _alerting:
_slack_vars = [ _slack_vars = [
"SLACK_WEBHOOK_URL", "SLACK_WEBHOOK_URL",
@ -8600,7 +8879,8 @@ async def get_config():
for _var in _slack_vars: for _var in _slack_vars:
env_variable = environment_variables.get(_var, None) env_variable = environment_variables.get(_var, None)
if env_variable is None: if env_variable is None:
_slack_env_vars[_var] = None _value = os.getenv("SLACK_WEBHOOK_URL", None)
_slack_env_vars[_var] = _value
else: else:
# decode + decrypt the value # decode + decrypt the value
decoded_b64 = base64.b64decode(env_variable) decoded_b64 = base64.b64decode(env_variable)
@ -8613,19 +8893,23 @@ async def get_config():
_all_alert_types = ( _all_alert_types = (
proxy_logging_obj.slack_alerting_instance._all_possible_alert_types() proxy_logging_obj.slack_alerting_instance._all_possible_alert_types()
) )
_data_to_return.append( _alerts_to_webhook = (
proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
)
alerting_data.append(
{ {
"name": "slack", "name": "slack",
"variables": _slack_env_vars, "variables": _slack_env_vars,
"alerting_types": _alerting_types, "active_alerts": _alerting_types,
"all_alert_types": _all_alert_types, "alerts_to_webhook": _alerts_to_webhook,
} }
) )
_router_settings = llm_router.get_settings() _router_settings = llm_router.get_settings()
return { return {
"status": "success", "status": "success",
"data": _data_to_return, "callbacks": _data_to_return,
"alerts": alerting_data,
"router_settings": _router_settings, "router_settings": _router_settings,
} }
except Exception as e: except Exception as e:
@ -8701,9 +8985,9 @@ async def test_endpoint(request: Request):
) )
async def health_services_endpoint( async def health_services_endpoint(
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth), user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
service: Literal["slack_budget_alerts", "langfuse", "slack"] = fastapi.Query( service: Literal[
description="Specify the service being hit." "slack_budget_alerts", "langfuse", "slack", "openmeter"
), ] = fastapi.Query(description="Specify the service being hit."),
): ):
""" """
Hidden endpoint. Hidden endpoint.
@ -8717,7 +9001,7 @@ async def health_services_endpoint(
raise HTTPException( raise HTTPException(
status_code=400, detail={"error": "Service must be specified."} status_code=400, detail={"error": "Service must be specified."}
) )
if service not in ["slack_budget_alerts", "langfuse", "slack"]: if service not in ["slack_budget_alerts", "langfuse", "slack", "openmeter"]:
raise HTTPException( raise HTTPException(
status_code=400, status_code=400,
detail={ detail={
@ -8725,6 +9009,18 @@ async def health_services_endpoint(
}, },
) )
if service == "openmeter":
_ = await litellm.acompletion(
model="openai/litellm-mock-response-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
user="litellm:/health/services",
mock_response="This is a mock response",
)
return {
"status": "success",
"message": "Mock LLM request made - check openmeter.",
}
if service == "langfuse": if service == "langfuse":
from litellm.integrations.langfuse import LangFuseLogger from litellm.integrations.langfuse import LangFuseLogger
@ -8741,27 +9037,73 @@ async def health_services_endpoint(
"message": "Mock LLM request made - check langfuse.", "message": "Mock LLM request made - check langfuse.",
} }
if "slack" in general_settings.get("alerting", []): if service == "slack" or service == "slack_budget_alerts":
test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700""" if "slack" in general_settings.get("alerting", []):
await proxy_logging_obj.alerting_handler(message=test_message, level="Low") # test_message = f"""\n🚨 `ProjectedLimitExceededError` 💸\n\n`Key Alias:` litellm-ui-test-alert \n`Expected Day of Error`: 28th March \n`Current Spend`: $100.00 \n`Projected Spend at end of month`: $1000.00 \n`Soft Limit`: $700"""
return { # check if user has opted into unique_alert_webhooks
"status": "success", if (
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel", proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url
} is not None
else: ):
raise HTTPException( for (
status_code=422, alert_type
detail={ ) in proxy_logging_obj.slack_alerting_instance.alert_to_webhook_url:
"error": '"slack" not in proxy config: general_settings. Unable to test this.' """
}, "llm_exceptions",
) "llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
"""
# only test alert if it's in active alert types
if (
proxy_logging_obj.slack_alerting_instance.alert_types
is not None
and alert_type
not in proxy_logging_obj.slack_alerting_instance.alert_types
):
continue
test_message = "default test message"
if alert_type == "llm_exceptions":
test_message = f"LLM Exception test alert"
elif alert_type == "llm_too_slow":
test_message = f"LLM Too Slow test alert"
elif alert_type == "llm_requests_hanging":
test_message = f"LLM Requests Hanging test alert"
elif alert_type == "budget_alerts":
test_message = f"Budget Alert test alert"
elif alert_type == "db_exceptions":
test_message = f"DB Exception test alert"
await proxy_logging_obj.alerting_handler(
message=test_message, level="Low", alert_type=alert_type
)
else:
await proxy_logging_obj.alerting_handler(
message="This is a test slack alert message",
level="Low",
alert_type="budget_alerts",
)
return {
"status": "success",
"message": "Mock Slack Alert sent, verify Slack Alert Received on your channel",
}
else:
raise HTTPException(
status_code=422,
detail={
"error": '"{}" not in proxy config: general_settings. Unable to test this.'.format(
service
)
},
)
except Exception as e: except Exception as e:
if isinstance(e, HTTPException): if isinstance(e, HTTPException):
raise ProxyException( raise ProxyException(
message=getattr(e, "detail", f"Authentication Error({str(e)})"), message=getattr(e, "detail", f"Authentication Error({str(e)})"),
type="auth_error", type="auth_error",
param=getattr(e, "param", "None"), param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_401_UNAUTHORIZED), code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
) )
elif isinstance(e, ProxyException): elif isinstance(e, ProxyException):
raise e raise e
@ -8769,7 +9111,7 @@ async def health_services_endpoint(
message="Authentication Error, " + str(e), message="Authentication Error, " + str(e),
type="auth_error", type="auth_error",
param=getattr(e, "param", "None"), param=getattr(e, "param", "None"),
code=status.HTTP_401_UNAUTHORIZED, code=status.HTTP_500_INTERNAL_SERVER_ERROR,
) )

View file

@ -183,6 +183,21 @@ model LiteLLM_SpendLogs {
end_user String? end_user String?
} }
// View spend, model, api_key per request
model LiteLLM_ErrorLogs {
request_id String @id @default(uuid())
startTime DateTime // Assuming start_time is a DateTime field
endTime DateTime // Assuming end_time is a DateTime field
api_base String @default("")
model_group String @default("") // public model_name / model_group
litellm_model_name String @default("") // model passed to litellm
model_id String @default("") // ID of model in ProxyModelTable
request_kwargs Json @default("{}")
exception_type String @default("")
exception_string String @default("")
status_code String @default("")
}
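
As a rough guide to what the new error-tracking path stores, here is an illustrative Python mirror of one `LiteLLM_ErrorLogs` row. The `ErrorLogRow` dataclass is only a sketch of the column shapes and defaults from the schema above; the proxy itself writes these rows through Prisma, not through this class.

from dataclasses import dataclass, field
from datetime import datetime
import json
import uuid

@dataclass
class ErrorLogRow:
    # mirrors the LiteLLM_ErrorLogs model defined above
    startTime: datetime
    endTime: datetime
    request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    api_base: str = ""
    model_group: str = ""          # public model_name / model_group
    litellm_model_name: str = ""   # model passed to litellm
    model_id: str = ""             # ID of model in ProxyModelTable
    request_kwargs: str = "{}"     # stored as Json in the DB
    exception_type: str = ""
    exception_string: str = ""
    status_code: str = ""

row = ErrorLogRow(
    startTime=datetime.now(),
    endTime=datetime.now(),
    model_group="gpt-3.5-turbo",
    exception_type="RateLimitError",
    status_code="429",
    request_kwargs=json.dumps({"messages": [{"role": "user", "content": "hi"}]}),
)
print(row.request_id, row.exception_type)
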
// Beta - allow team members to request access to a model // Beta - allow team members to request access to a model
model LiteLLM_UserNotifications { model LiteLLM_UserNotifications {
request_id String @id request_id String @id

View file

@ -1,6 +1,6 @@
from typing import Optional, List, Any, Literal, Union from typing import Optional, List, Any, Literal, Union
import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time import os, subprocess, hashlib, importlib, asyncio, copy, json, aiohttp, httpx, time
import litellm, backoff import litellm, backoff, traceback
from litellm.proxy._types import ( from litellm.proxy._types import (
UserAPIKeyAuth, UserAPIKeyAuth,
DynamoDBArgs, DynamoDBArgs,
@ -199,6 +199,33 @@ class ProxyLogging:
print_verbose(f"final data being sent to {call_type} call: {data}") print_verbose(f"final data being sent to {call_type} call: {data}")
return data return data
except Exception as e: except Exception as e:
if "litellm_logging_obj" in data:
logging_obj: litellm.utils.Logging = data["litellm_logging_obj"]
## ASYNC FAILURE HANDLER ##
error_message = ""
if isinstance(e, HTTPException):
if isinstance(e.detail, str):
error_message = e.detail
elif isinstance(e.detail, dict):
error_message = json.dumps(e.detail)
else:
error_message = str(e)
else:
error_message = str(e)
error_raised = Exception(f"{error_message}")
await logging_obj.async_failure_handler(
exception=error_raised,
traceback_exception=traceback.format_exc(),
)
## SYNC FAILURE HANDLER ##
try:
logging_obj.failure_handler(
error_raised, traceback.format_exc()
) # DO NOT MAKE THREADED - router retry fallback relies on this!
except Exception as error_val:
pass
raise e raise e
async def during_call_hook( async def during_call_hook(
@ -256,7 +283,16 @@ class ProxyLogging:
) )
async def alerting_handler( async def alerting_handler(
self, message: str, level: Literal["Low", "Medium", "High"] self,
message: str,
level: Literal["Low", "Medium", "High"],
alert_type: Literal[
"llm_exceptions",
"llm_too_slow",
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
],
): ):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -289,7 +325,7 @@ class ProxyLogging:
for client in self.alerting: for client in self.alerting:
if client == "slack": if client == "slack":
await self.slack_alerting_instance.send_alert( await self.slack_alerting_instance.send_alert(
message=message, level=level message=message, level=level, alert_type=alert_type
) )
elif client == "sentry": elif client == "sentry":
if litellm.utils.sentry_sdk_instance is not None: if litellm.utils.sentry_sdk_instance is not None:
@ -323,6 +359,7 @@ class ProxyLogging:
self.alerting_handler( self.alerting_handler(
message=f"DB read/write call failed: {error_message}", message=f"DB read/write call failed: {error_message}",
level="High", level="High",
alert_type="db_exceptions",
) )
) )
@ -354,7 +391,9 @@ class ProxyLogging:
return return
asyncio.create_task( asyncio.create_task(
self.alerting_handler( self.alerting_handler(
message=f"LLM API call failed: {str(original_exception)}", level="High" message=f"LLM API call failed: {str(original_exception)}",
level="High",
alert_type="llm_exceptions",
) )
) )
@ -1738,7 +1777,7 @@ def get_logging_payload(kwargs, response_obj, start_time, end_time):
usage = response_obj["usage"] usage = response_obj["usage"]
if type(usage) == litellm.Usage: if type(usage) == litellm.Usage:
usage = dict(usage) usage = dict(usage)
id = response_obj.get("id", str(uuid.uuid4())) id = response_obj.get("id", kwargs.get("litellm_call_id"))
api_key = metadata.get("user_api_key", "") api_key = metadata.get("user_api_key", "")
if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"): if api_key is not None and isinstance(api_key, str) and api_key.startswith("sk-"):
# hash the api_key # hash the api_key
@ -2010,6 +2049,11 @@ async def update_spend(
raise e raise e
### UPDATE KEY TABLE ### ### UPDATE KEY TABLE ###
verbose_proxy_logger.debug(
"KEY Spend transactions: {}".format(
len(prisma_client.key_list_transactons.keys())
)
)
if len(prisma_client.key_list_transactons.keys()) > 0: if len(prisma_client.key_list_transactons.keys()) > 0:
for i in range(n_retry_times + 1): for i in range(n_retry_times + 1):
start_time = time.time() start_time = time.time()

View file

@ -50,7 +50,6 @@ class Router:
model_names: List = [] model_names: List = []
cache_responses: Optional[bool] = False cache_responses: Optional[bool] = False
default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour default_cache_time_seconds: int = 1 * 60 * 60 # 1 hour
num_retries: int = 0
tenacity = None tenacity = None
leastbusy_logger: Optional[LeastBusyLoggingHandler] = None leastbusy_logger: Optional[LeastBusyLoggingHandler] = None
lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None lowesttpm_logger: Optional[LowestTPMLoggingHandler] = None
@ -70,9 +69,11 @@ class Router:
] = None, # if you want to cache across model groups ] = None, # if you want to cache across model groups
client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds client_ttl: int = 3600, # ttl for cached clients - will re-initialize after this time in seconds
## RELIABILITY ## ## RELIABILITY ##
num_retries: int = 0, num_retries: Optional[int] = None,
timeout: Optional[float] = None, timeout: Optional[float] = None,
default_litellm_params={}, # default params for Router.chat.completion.create default_litellm_params: Optional[
dict
] = None, # default params for Router.chat.completion.create
default_max_parallel_requests: Optional[int] = None, default_max_parallel_requests: Optional[int] = None,
set_verbose: bool = False, set_verbose: bool = False,
debug_level: Literal["DEBUG", "INFO"] = "INFO", debug_level: Literal["DEBUG", "INFO"] = "INFO",
@ -158,6 +159,7 @@ class Router:
router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}]) router = Router(model_list=model_list, fallbacks=[{"azure-gpt-3.5-turbo": "openai-gpt-3.5-turbo"}])
``` ```
""" """
if semaphore: if semaphore:
self.semaphore = semaphore self.semaphore = semaphore
self.set_verbose = set_verbose self.set_verbose = set_verbose
@ -229,7 +231,14 @@ class Router:
self.failed_calls = ( self.failed_calls = (
InMemoryCache() InMemoryCache()
) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown ) # cache to track failed call per deployment, if num failed calls within 1 minute > allowed fails, then add it to cooldown
self.num_retries = num_retries or litellm.num_retries or 0
if num_retries is not None:
self.num_retries = num_retries
elif litellm.num_retries is not None:
self.num_retries = litellm.num_retries
else:
self.num_retries = openai.DEFAULT_MAX_RETRIES
self.timeout = timeout or litellm.request_timeout self.timeout = timeout or litellm.request_timeout
self.retry_after = retry_after self.retry_after = retry_after
@ -255,6 +264,7 @@ class Router:
) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group ) # dict to store aliases for router, ex. {"gpt-4": "gpt-3.5-turbo"}, all requests with gpt-4 -> get routed to gpt-3.5-turbo group
# make Router.chat.completions.create compatible for openai.chat.completions.create # make Router.chat.completions.create compatible for openai.chat.completions.create
default_litellm_params = default_litellm_params or {}
self.chat = litellm.Chat(params=default_litellm_params, router_obj=self) self.chat = litellm.Chat(params=default_litellm_params, router_obj=self)
# default litellm args # default litellm args
@ -280,6 +290,21 @@ class Router:
} }
""" """
### ROUTING SETUP ### ### ROUTING SETUP ###
self.routing_strategy_init(
routing_strategy=routing_strategy,
routing_strategy_args=routing_strategy_args,
)
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
print( # noqa
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
) # noqa
self.routing_strategy_args = routing_strategy_args
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
if routing_strategy == "least-busy": if routing_strategy == "least-busy":
self.leastbusy_logger = LeastBusyLoggingHandler( self.leastbusy_logger = LeastBusyLoggingHandler(
router_cache=self.cache, model_list=self.model_list router_cache=self.cache, model_list=self.model_list
@ -311,15 +336,6 @@ class Router:
) )
if isinstance(litellm.callbacks, list): if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
## COOLDOWNS ##
if isinstance(litellm.failure_callback, list):
litellm.failure_callback.append(self.deployment_callback_on_failure)
else:
litellm.failure_callback = [self.deployment_callback_on_failure]
verbose_router_logger.info(
f"Intialized router with Routing strategy: {self.routing_strategy}\n\nRouting fallbacks: {self.fallbacks}\n\nRouting context window fallbacks: {self.context_window_fallbacks}\n\nRouter Redis Caching={self.cache.redis_cache}"
)
self.routing_strategy_args = routing_strategy_args
def print_deployment(self, deployment: dict): def print_deployment(self, deployment: dict):
""" """
@ -428,6 +444,7 @@ class Router:
kwargs["messages"] = messages kwargs["messages"] = messages
kwargs["original_function"] = self._acompletion kwargs["original_function"] = self._acompletion
kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries) kwargs["num_retries"] = kwargs.get("num_retries", self.num_retries)
timeout = kwargs.get("request_timeout", self.timeout) timeout = kwargs.get("request_timeout", self.timeout)
kwargs.setdefault("metadata", {}).update({"model_group": model}) kwargs.setdefault("metadata", {}).update({"model_group": model})
@ -469,6 +486,7 @@ class Router:
) )
kwargs["model_info"] = deployment.get("model_info", {}) kwargs["model_info"] = deployment.get("model_info", {})
data = deployment["litellm_params"].copy() data = deployment["litellm_params"].copy()
model_name = data["model"] model_name = data["model"]
for k, v in self.default_litellm_params.items(): for k, v in self.default_litellm_params.items():
if ( if (
@ -1415,10 +1433,12 @@ class Router:
context_window_fallbacks = kwargs.pop( context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks "context_window_fallbacks", self.context_window_fallbacks
) )
verbose_router_logger.debug(
f"async function w/ retries: original_function - {original_function}"
)
num_retries = kwargs.pop("num_retries") num_retries = kwargs.pop("num_retries")
verbose_router_logger.debug(
f"async function w/ retries: original_function - {original_function}, num_retries - {num_retries}"
)
try: try:
# if the function call is successful, no exception will be raised and we'll break out of the loop # if the function call is successful, no exception will be raised and we'll break out of the loop
response = await original_function(*args, **kwargs) response = await original_function(*args, **kwargs)
@ -1436,37 +1456,47 @@ class Router:
raise original_exception raise original_exception
### RETRY ### RETRY
#### check if it should retry + back-off if required #### check if it should retry + back-off if required
if "No models available" in str(e): # if "No models available" in str(
timeout = litellm._calculate_retry_after( # e
remaining_retries=num_retries, # ) or RouterErrors.no_deployments_available.value in str(e):
max_retries=num_retries, # timeout = litellm._calculate_retry_after(
min_timeout=self.retry_after, # remaining_retries=num_retries,
) # max_retries=num_retries,
await asyncio.sleep(timeout) # min_timeout=self.retry_after,
elif RouterErrors.user_defined_ratelimit_error.value in str(e): # )
raise e # don't wait to retry if deployment hits user-defined rate-limit # await asyncio.sleep(timeout)
elif hasattr(original_exception, "status_code") and litellm._should_retry( # elif RouterErrors.user_defined_ratelimit_error.value in str(e):
status_code=original_exception.status_code # raise e # don't wait to retry if deployment hits user-defined rate-limit
):
if hasattr(original_exception, "response") and hasattr(
original_exception.response, "headers"
):
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
response_headers=original_exception.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=num_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
else:
raise original_exception
# elif hasattr(original_exception, "status_code") and litellm._should_retry(
# status_code=original_exception.status_code
# ):
# if hasattr(original_exception, "response") and hasattr(
# original_exception.response, "headers"
# ):
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# response_headers=original_exception.response.headers,
# min_timeout=self.retry_after,
# )
# else:
# timeout = litellm._calculate_retry_after(
# remaining_retries=num_retries,
# max_retries=num_retries,
# min_timeout=self.retry_after,
# )
# await asyncio.sleep(timeout)
# else:
# raise original_exception
### RETRY
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
)
await asyncio.sleep(_timeout)
## LOGGING ## LOGGING
if num_retries > 0: if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception) kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
@ -1488,34 +1518,12 @@ class Router:
## LOGGING ## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=e) kwargs = self.log_retry(kwargs=kwargs, e=e)
remaining_retries = num_retries - current_attempt remaining_retries = num_retries - current_attempt
if "No models available" in str(e): _timeout = self._router_should_retry(
timeout = litellm._calculate_retry_after( e=original_exception,
remaining_retries=remaining_retries, remaining_retries=remaining_retries,
max_retries=num_retries, num_retries=num_retries,
min_timeout=self.retry_after, )
) await asyncio.sleep(_timeout)
await asyncio.sleep(timeout)
elif (
hasattr(e, "status_code")
and hasattr(e, "response")
and litellm._should_retry(status_code=e.status_code)
):
if hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
await asyncio.sleep(timeout)
else:
raise e
raise original_exception raise original_exception
def function_with_fallbacks(self, *args, **kwargs): def function_with_fallbacks(self, *args, **kwargs):
@ -1606,6 +1614,27 @@ class Router:
raise e raise e
raise original_exception raise original_exception
def _router_should_retry(
self, e: Exception, remaining_retries: int, num_retries: int
) -> Union[int, float]:
"""
Calculate back-off, then retry
"""
if hasattr(e, "response") and hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
return timeout
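
`_router_should_retry` defers the actual math to `litellm._calculate_retry_after`. A simplified stand-in that captures the general shape (honor a numeric `Retry-After` header when present, otherwise exponential back-off with jitter, never below `min_timeout`) might look like the sketch below; it is not the library's implementation.

import random
from typing import Mapping, Optional, Union

def calculate_retry_after(
    remaining_retries: int,
    max_retries: int,
    response_headers: Optional[Mapping[str, str]] = None,
    min_timeout: Union[int, float] = 0,
) -> float:
    # prefer a numeric Retry-After header if the provider sent one
    if response_headers and response_headers.get("retry-after", "").replace(".", "", 1).isdigit():
        return max(float(response_headers["retry-after"]), min_timeout)
    attempt = max_retries - remaining_retries   # 0 on the first retry
    backoff = min(0.5 * (2 ** attempt), 8.0)    # cap the exponential growth
    jitter = 1 - 0.25 * random.random()         # avoid thundering herds
    return max(backoff * jitter, min_timeout)

print(calculate_retry_after(remaining_retries=2, max_retries=3, min_timeout=0.5))
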
def function_with_retries(self, *args, **kwargs): def function_with_retries(self, *args, **kwargs):
""" """
Try calling the model 3 times. Shuffle between available deployments. Try calling the model 3 times. Shuffle between available deployments.
@ -1619,15 +1648,13 @@ class Router:
context_window_fallbacks = kwargs.pop( context_window_fallbacks = kwargs.pop(
"context_window_fallbacks", self.context_window_fallbacks "context_window_fallbacks", self.context_window_fallbacks
) )
try: try:
# if the function call is successful, no exception will be raised and we'll break out of the loop # if the function call is successful, no exception will be raised and we'll break out of the loop
response = original_function(*args, **kwargs) response = original_function(*args, **kwargs)
return response return response
except Exception as e: except Exception as e:
original_exception = e original_exception = e
verbose_router_logger.debug(
f"num retries in function with retries: {num_retries}"
)
### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR ### CHECK IF RATE LIMIT / CONTEXT WINDOW ERROR
if ( if (
isinstance(original_exception, litellm.ContextWindowExceededError) isinstance(original_exception, litellm.ContextWindowExceededError)
@ -1641,6 +1668,12 @@ class Router:
if num_retries > 0: if num_retries > 0:
kwargs = self.log_retry(kwargs=kwargs, e=original_exception) kwargs = self.log_retry(kwargs=kwargs, e=original_exception)
### RETRY ### RETRY
_timeout = self._router_should_retry(
e=original_exception,
remaining_retries=num_retries,
num_retries=num_retries,
)
time.sleep(_timeout)
for current_attempt in range(num_retries): for current_attempt in range(num_retries):
verbose_router_logger.debug( verbose_router_logger.debug(
f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}" f"retrying request. Current attempt - {current_attempt}; retries left: {num_retries}"
@ -1654,34 +1687,12 @@ class Router:
## LOGGING ## LOGGING
kwargs = self.log_retry(kwargs=kwargs, e=e) kwargs = self.log_retry(kwargs=kwargs, e=e)
remaining_retries = num_retries - current_attempt remaining_retries = num_retries - current_attempt
if "No models available" in str(e): _timeout = self._router_should_retry(
timeout = litellm._calculate_retry_after( e=e,
remaining_retries=remaining_retries, remaining_retries=remaining_retries,
max_retries=num_retries, num_retries=num_retries,
min_timeout=self.retry_after, )
) time.sleep(_timeout)
time.sleep(timeout)
elif (
hasattr(e, "status_code")
and hasattr(e, "response")
and litellm._should_retry(status_code=e.status_code)
):
if hasattr(e.response, "headers"):
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
response_headers=e.response.headers,
min_timeout=self.retry_after,
)
else:
timeout = litellm._calculate_retry_after(
remaining_retries=remaining_retries,
max_retries=num_retries,
min_timeout=self.retry_after,
)
time.sleep(timeout)
else:
raise e
raise original_exception raise original_exception
### HELPER FUNCTIONS ### HELPER FUNCTIONS
@ -1715,10 +1726,11 @@ class Router:
) # i.e. azure ) # i.e. azure
metadata = kwargs.get("litellm_params", {}).get("metadata", None) metadata = kwargs.get("litellm_params", {}).get("metadata", None)
_model_info = kwargs.get("litellm_params", {}).get("model_info", {}) _model_info = kwargs.get("litellm_params", {}).get("model_info", {})
if isinstance(_model_info, dict): if isinstance(_model_info, dict):
deployment_id = _model_info.get("id", None) deployment_id = _model_info.get("id", None)
self._set_cooldown_deployments( self._set_cooldown_deployments(
deployment_id exception_status=exception_status, deployment=deployment_id
) # setting deployment_id in cooldown deployments ) # setting deployment_id in cooldown deployments
if custom_llm_provider: if custom_llm_provider:
model_name = f"{custom_llm_provider}/{model_name}" model_name = f"{custom_llm_provider}/{model_name}"
@ -1778,9 +1790,15 @@ class Router:
key=rpm_key, value=request_count, local_only=True key=rpm_key, value=request_count, local_only=True
) # don't change existing ttl ) # don't change existing ttl
def _set_cooldown_deployments(self, deployment: Optional[str] = None): def _set_cooldown_deployments(
self, exception_status: Union[str, int], deployment: Optional[str] = None
):
""" """
Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute Add a model to the list of models being cooled down for that minute, if it exceeds the allowed fails / minute
or
the exception is not one that should be immediately retried (e.g. 401)
""" """
if deployment is None: if deployment is None:
return return
@ -1797,7 +1815,20 @@ class Router:
f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}" f"Attempting to add {deployment} to cooldown list. updated_fails: {updated_fails}; self.allowed_fails: {self.allowed_fails}"
) )
cooldown_time = self.cooldown_time or 1 cooldown_time = self.cooldown_time or 1
if updated_fails > self.allowed_fails:
if isinstance(exception_status, str):
try:
exception_status = int(exception_status)
except Exception as e:
verbose_router_logger.debug(
"Unable to cast exception status to int {}. Defaulting to status=500.".format(
exception_status
)
)
exception_status = 500
_should_retry = litellm._should_retry(status_code=exception_status)
if updated_fails > self.allowed_fails or _should_retry == False:
# get the current cooldown list for that minute # get the current cooldown list for that minute
cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls cooldown_key = f"{current_minute}:cooldown_models" # group cooldown models by minute to reduce number of redis calls
cached_value = self.cache.get_cache(key=cooldown_key) cached_value = self.cache.get_cache(key=cooldown_key)
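
The cooldown change above means a deployment is parked either after too many fails in the current window or immediately on a non-retryable status. A rough, self-contained approximation of that decision follows; the retryable-status set here (408/409/429/5xx) is the usual convention and an assumption, not litellm's exact `_should_retry`.

from typing import Union

def should_cooldown_deployment(
    exception_status: Union[str, int],
    updated_fails: int,
    allowed_fails: int,
) -> bool:
    # cool down when fails exceed the allowance, or when the error is not worth retrying
    try:
        status = int(exception_status)
    except (TypeError, ValueError):
        status = 500  # default to a retryable server error, as the router does
    retryable = status in (408, 409, 429) or status >= 500
    return updated_fails > allowed_fails or not retryable

print(should_cooldown_deployment("401", updated_fails=0, allowed_fails=3))  # True - auth errors cool down immediately
print(should_cooldown_deployment(429, updated_fails=1, allowed_fails=3))    # False - rate limits get retried first
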
@ -1929,6 +1960,7 @@ class Router:
) )
default_api_base = api_base default_api_base = api_base
default_api_key = api_key default_api_key = api_key
if ( if (
model_name in litellm.open_ai_chat_completion_models model_name in litellm.open_ai_chat_completion_models
or custom_llm_provider in litellm.openai_compatible_providers or custom_llm_provider in litellm.openai_compatible_providers
@ -1940,8 +1972,10 @@ class Router:
or "ft:gpt-3.5-turbo" in model_name or "ft:gpt-3.5-turbo" in model_name
or model_name in litellm.open_ai_embedding_models or model_name in litellm.open_ai_embedding_models
): ):
is_azure_ai_studio_model: bool = False
if custom_llm_provider == "azure": if custom_llm_provider == "azure":
if litellm.utils._is_non_openai_azure_model(model_name): if litellm.utils._is_non_openai_azure_model(model_name):
is_azure_ai_studio_model = True
custom_llm_provider = "openai" custom_llm_provider = "openai"
# remove azure prefx from model_name # remove azure prefx from model_name
model_name = model_name.replace("azure/", "") model_name = model_name.replace("azure/", "")
@ -1964,6 +1998,25 @@ class Router:
api_base = litellm.get_secret(api_base_env_name) api_base = litellm.get_secret(api_base_env_name)
litellm_params["api_base"] = api_base litellm_params["api_base"] = api_base
## AZURE AI STUDIO MISTRAL CHECK ##
"""
Make sure api base ends in /v1/
if not, add it - https://github.com/BerriAI/litellm/issues/2279
"""
if (
is_azure_ai_studio_model == True
and api_base is not None
and not api_base.endswith("/v1/")
):
# check if it ends with a trailing slash
if api_base.endswith("/"):
api_base += "v1/"
elif api_base.endswith("/v1"):
api_base += "/"
else:
api_base += "/v1/"
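
The Azure AI Studio check above just normalizes the api_base suffix so it always ends in `/v1/`. Pulled out as a standalone sketch (`ensure_v1_suffix` and the sample endpoint URLs are hypothetical):

def ensure_v1_suffix(api_base: str) -> str:
    # make sure an Azure AI Studio api_base ends in /v1/ - https://github.com/BerriAI/litellm/issues/2279
    if api_base.endswith("/v1/"):
        return api_base
    if api_base.endswith("/v1"):
        return api_base + "/"
    if api_base.endswith("/"):
        return api_base + "v1/"
    return api_base + "/v1/"

for base in (
    "https://my-mistral.westus2.inference.ai.azure.com",
    "https://my-mistral.westus2.inference.ai.azure.com/",
    "https://my-mistral.westus2.inference.ai.azure.com/v1",
):
    print(ensure_v1_suffix(base))
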
api_version = litellm_params.get("api_version") api_version = litellm_params.get("api_version")
if api_version and api_version.startswith("os.environ/"): if api_version and api_version.startswith("os.environ/"):
api_version_env_name = api_version.replace("os.environ/", "") api_version_env_name = api_version.replace("os.environ/", "")
@ -1986,7 +2039,9 @@ class Router:
stream_timeout = litellm.get_secret(stream_timeout_env_name) stream_timeout = litellm.get_secret(stream_timeout_env_name)
litellm_params["stream_timeout"] = stream_timeout litellm_params["stream_timeout"] = stream_timeout
max_retries = litellm_params.pop("max_retries", 2) max_retries = litellm_params.pop(
"max_retries", 0
) # router handles retry logic
if isinstance(max_retries, str) and max_retries.startswith("os.environ/"): if isinstance(max_retries, str) and max_retries.startswith("os.environ/"):
max_retries_env_name = max_retries.replace("os.environ/", "") max_retries_env_name = max_retries.replace("os.environ/", "")
max_retries = litellm.get_secret(max_retries_env_name) max_retries = litellm.get_secret(max_retries_env_name)
@ -2052,9 +2107,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2074,9 +2131,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2096,9 +2155,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2118,9 +2179,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2158,9 +2221,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2178,9 +2243,11 @@ class Router:
timeout=timeout, timeout=timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( verify=litellm.ssl_verify,
max_connections=1000, max_keepalive_connections=100 limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100
),
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2199,9 +2266,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), ),
@ -2219,9 +2288,11 @@ class Router:
timeout=stream_timeout, timeout=stream_timeout,
max_retries=max_retries, max_retries=max_retries,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), ),
@ -2249,9 +2320,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2271,9 +2344,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2294,9 +2369,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.AsyncClient( http_client=httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), transport=AsyncCustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=async_proxy_mounts, mounts=async_proxy_mounts,
), # type: ignore ), # type: ignore
@ -2317,9 +2394,11 @@ class Router:
max_retries=max_retries, max_retries=max_retries,
organization=organization, organization=organization,
http_client=httpx.Client( http_client=httpx.Client(
transport=CustomHTTPTransport(), transport=CustomHTTPTransport(
limits=httpx.Limits( limits=httpx.Limits(
max_connections=1000, max_keepalive_connections=100 max_connections=1000, max_keepalive_connections=100
),
verify=litellm.ssl_verify,
), ),
mounts=sync_proxy_mounts, mounts=sync_proxy_mounts,
), # type: ignore ), # type: ignore
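
All of the client-construction changes in this hunk follow the same pattern: the connection limits and the new `litellm.ssl_verify` flag move onto the transport instead of the client. A minimal sketch using stock httpx transports in place of litellm's custom ones, with the flag hard-coded for illustration:

import httpx

ssl_verify = True  # litellm.ssl_verify would normally drive this

transport = httpx.HTTPTransport(
    verify=ssl_verify,
    limits=httpx.Limits(max_connections=1000, max_keepalive_connections=100),
)
client = httpx.Client(transport=transport)
# the OpenAI / AzureOpenAI clients cached by the router are then constructed with http_client=client
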
@ -2550,6 +2629,11 @@ class Router:
for var in vars_to_include: for var in vars_to_include:
if var in _all_vars: if var in _all_vars:
_settings_to_return[var] = _all_vars[var] _settings_to_return[var] = _all_vars[var]
if (
var == "routing_strategy_args"
and self.routing_strategy == "latency-based-routing"
):
_settings_to_return[var] = self.lowestlatency_logger.routing_args.json()
return _settings_to_return return _settings_to_return
def update_settings(self, **kwargs): def update_settings(self, **kwargs):
@ -2581,6 +2665,13 @@ class Router:
_casted_value = int(kwargs[var]) _casted_value = int(kwargs[var])
setattr(self, var, _casted_value) setattr(self, var, _casted_value)
else: else:
if var == "routing_strategy":
self.routing_strategy_init(
routing_strategy=kwargs[var],
routing_strategy_args=kwargs.get(
"routing_strategy_args", {}
),
)
setattr(self, var, kwargs[var]) setattr(self, var, kwargs[var])
else: else:
verbose_router_logger.debug("Setting {} is not allowed".format(var)) verbose_router_logger.debug("Setting {} is not allowed".format(var))
@ -2717,7 +2808,10 @@ class Router:
self.cache.get_cache(key=model_id, local_only=True) or 0 self.cache.get_cache(key=model_id, local_only=True) or 0
) )
### get usage based cache ### ### get usage based cache ###
if isinstance(model_group_cache, dict): if (
isinstance(model_group_cache, dict)
and self.routing_strategy != "usage-based-routing-v2"
):
model_group_cache[model_id] = model_group_cache.get(model_id, 0) model_group_cache[model_id] = model_group_cache.get(model_id, 0)
current_request = max( current_request = max(
@ -2745,7 +2839,7 @@ class Router:
if _rate_limit_error == True: # allow generic fallback logic to take place if _rate_limit_error == True: # allow generic fallback logic to take place
raise ValueError( raise ValueError(
f"No deployments available for selected model, passed model={model}" f"{RouterErrors.no_deployments_available.value}, passed model={model}"
) )
elif _context_window_error == True: elif _context_window_error == True:
raise litellm.ContextWindowExceededError( raise litellm.ContextWindowExceededError(
@ -2883,6 +2977,11 @@ class Router:
model=model, healthy_deployments=healthy_deployments, messages=messages model=model, healthy_deployments=healthy_deployments, messages=messages
) )
if len(healthy_deployments) == 0:
raise ValueError(
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
)
if ( if (
self.routing_strategy == "usage-based-routing-v2" self.routing_strategy == "usage-based-routing-v2"
and self.lowesttpm_logger_v2 is not None and self.lowesttpm_logger_v2 is not None
@ -2938,7 +3037,7 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available" f"get_available_deployment for model: {model}, No deployment available"
) )
raise ValueError( raise ValueError(
f"No deployments available for selected model, passed model={model}" f"{RouterErrors.no_deployments_available.value}, passed model={model}"
) )
verbose_router_logger.info( verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}" f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"
@ -3068,7 +3167,7 @@ class Router:
f"get_available_deployment for model: {model}, No deployment available" f"get_available_deployment for model: {model}, No deployment available"
) )
raise ValueError( raise ValueError(
f"No deployments available for selected model, passed model={model}" f"{RouterErrors.no_deployments_available.value}, passed model={model}"
) )
verbose_router_logger.info( verbose_router_logger.info(
f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}" f"get_available_deployment for model: {model}, Selected deployment: {self.print_deployment(deployment)} for model: {model}"

View file

@ -4,6 +4,7 @@ from pydantic import BaseModel, Extra, Field, root_validator
import dotenv, os, requests, random import dotenv, os, requests, random
from typing import Optional, Union, List, Dict from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta from datetime import datetime, timedelta
import random
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback
@ -29,6 +30,7 @@ class LiteLLMBase(BaseModel):
class RoutingArgs(LiteLLMBase): class RoutingArgs(LiteLLMBase):
ttl: int = 1 * 60 * 60 # 1 hour ttl: int = 1 * 60 * 60 # 1 hour
lowest_latency_buffer: float = 0
class LowestLatencyLoggingHandler(CustomLogger): class LowestLatencyLoggingHandler(CustomLogger):
@ -312,6 +314,14 @@ class LowestLatencyLoggingHandler(CustomLogger):
except: except:
input_tokens = 0 input_tokens = 0
            # randomly sample from all_deployments, in case all deployments have latency=0.0
_items = all_deployments.items()
all_deployments = random.sample(list(_items), len(_items))
all_deployments = dict(all_deployments)
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
potential_deployments = []
for item, item_map in all_deployments.items(): for item, item_map in all_deployments.items():
## get the item from model list ## get the item from model list
_deployment = None _deployment = None
@ -345,23 +355,48 @@ class LowestLatencyLoggingHandler(CustomLogger):
if isinstance(_call_latency, float): if isinstance(_call_latency, float):
total += _call_latency total += _call_latency
item_latency = total / len(item_latency) item_latency = total / len(item_latency)
if item_latency == 0:
deployment = _deployment # -------------- #
break # Debugging Logic
elif ( # -------------- #
# We use _latency_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
                    # this helps a user to debug why the router picked a specific deployment      #
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
if _deployment_api_base is not None:
_latency_per_deployment[_deployment_api_base] = item_latency
# -------------- #
# End of Debugging Logic
# -------------- #
if (
item_tpm + input_tokens > _deployment_tpm item_tpm + input_tokens > _deployment_tpm
or item_rpm + 1 > _deployment_rpm or item_rpm + 1 > _deployment_rpm
): # if user passed in tpm / rpm in the model_list ): # if user passed in tpm / rpm in the model_list
continue continue
elif item_latency < lowest_latency: else:
lowest_latency = item_latency potential_deployments.append((_deployment, item_latency))
deployment = _deployment
if len(potential_deployments) == 0:
return None
# Sort potential deployments by latency
sorted_deployments = sorted(potential_deployments, key=lambda x: x[1])
# Find lowest latency deployment
lowest_latency = sorted_deployments[0][1]
# Find deployments within buffer of lowest latency
buffer = self.routing_args.lowest_latency_buffer * lowest_latency
valid_deployments = [
x for x in sorted_deployments if x[1] <= lowest_latency + buffer
]
# Pick a random deployment from valid deployments
random_valid_deployment = random.choice(valid_deployments)
deployment = random_valid_deployment[0]
        # _latency_per_deployment is used for debugging
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
_latency_per_deployment[_deployment_api_base] = item_latency
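
The lowest-latency strategy no longer hard-picks the single fastest deployment: anything within `lowest_latency_buffer` of the best latency is a candidate, and one is chosen at random. A toy version of that selection rule, with made-up deployment names and latencies:

import random
from typing import Dict, List, Tuple

def pick_deployment(latencies: Dict[str, float], lowest_latency_buffer: float = 0.0) -> str:
    # anything within (1 + buffer) * lowest latency is fair game, chosen at random
    ranked: List[Tuple[str, float]] = sorted(latencies.items(), key=lambda x: x[1])
    lowest = ranked[0][1]
    cutoff = lowest + lowest_latency_buffer * lowest
    candidates = [name for name, latency in ranked if latency <= cutoff]
    return random.choice(candidates)

latencies = {"azure/gpt-4-eastus": 0.41, "azure/gpt-4-westus": 0.44, "openai/gpt-4": 0.90}
print(pick_deployment(latencies, lowest_latency_buffer=0.1))  # eastus or westus, never the 0.90s one
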
if request_kwargs is not None and "metadata" in request_kwargs: if request_kwargs is not None and "metadata" in request_kwargs:
request_kwargs["metadata"][ request_kwargs["metadata"][
"_latency_per_deployment" "_latency_per_deployment"

View file

@ -206,7 +206,7 @@ class LowestTPMLoggingHandler(CustomLogger):
if item_tpm + input_tokens > _deployment_tpm: if item_tpm + input_tokens > _deployment_tpm:
continue continue
elif (rpm_dict is not None and item in rpm_dict) and ( elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm rpm_dict[item] + 1 >= _deployment_rpm
): ):
continue continue
elif item_tpm < lowest_tpm: elif item_tpm < lowest_tpm:

View file

@ -333,7 +333,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
tpm_dict[tpm_key] = 0 tpm_dict[tpm_key] = 0
all_deployments = tpm_dict all_deployments = tpm_dict
deployment = None potential_deployments = [] # if multiple deployments have the same low value
for item, item_tpm in all_deployments.items(): for item, item_tpm in all_deployments.items():
## get the item from model list ## get the item from model list
_deployment = None _deployment = None
@ -343,6 +343,8 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
_deployment = m _deployment = m
if _deployment is None: if _deployment is None:
continue # skip to next one continue # skip to next one
elif item_tpm is None:
continue # skip if unhealthy deployment
_deployment_tpm = None _deployment_tpm = None
if _deployment_tpm is None: if _deployment_tpm is None:
@ -366,14 +368,20 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
if item_tpm + input_tokens > _deployment_tpm: if item_tpm + input_tokens > _deployment_tpm:
continue continue
elif (rpm_dict is not None and item in rpm_dict) and ( elif (rpm_dict is not None and item in rpm_dict) and (
rpm_dict[item] + 1 > _deployment_rpm rpm_dict[item] + 1 >= _deployment_rpm
): ):
continue continue
elif item_tpm == lowest_tpm:
potential_deployments.append(_deployment)
elif item_tpm < lowest_tpm: elif item_tpm < lowest_tpm:
lowest_tpm = item_tpm lowest_tpm = item_tpm
deployment = _deployment potential_deployments = [_deployment]
print_verbose("returning picked lowest tpm/rpm deployment.") print_verbose("returning picked lowest tpm/rpm deployment.")
return deployment
if len(potential_deployments) > 0:
return random.choice(potential_deployments)
else:
return None
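
The v2 TPM strategy now keeps every deployment tied at the lowest usage and picks among them at random, skipping unhealthy (`None`) entries and anything that would exceed its TPM limit with the incoming tokens. A self-contained sketch of that selection rule; names and numbers are illustrative.

import random
from typing import Dict, List, Optional

def pick_lowest_tpm(
    usage: Dict[str, Optional[int]],
    tpm_limits: Dict[str, float],
    input_tokens: int,
) -> Optional[str]:
    # lowest current TPM wins; ties are broken randomly; unhealthy / over-limit deployments are skipped
    lowest = float("inf")
    candidates: List[str] = []
    for deployment, tpm in usage.items():
        if tpm is None:  # unhealthy deployment
            continue
        if tpm + input_tokens > tpm_limits.get(deployment, float("inf")):
            continue
        if tpm == lowest:
            candidates.append(deployment)
        elif tpm < lowest:
            lowest = tpm
            candidates = [deployment]
    return random.choice(candidates) if candidates else None

usage = {"azure-1": 1200, "azure-2": 1200, "azure-3": None}
print(pick_lowest_tpm(usage, tpm_limits={"azure-1": 100000, "azure-2": 100000}, input_tokens=50))
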
async def async_get_available_deployments( async def async_get_available_deployments(
self, self,
@ -394,6 +402,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
dt = get_utc_datetime()
current_minute = dt.strftime("%H-%M")
tpm_keys = []
rpm_keys = []
for m in healthy_deployments:
@ -416,7 +425,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
tpm_values = combined_tpm_rpm_values[: len(tpm_keys)]
rpm_values = combined_tpm_rpm_values[len(tpm_keys) :]
- return self._common_checks_available_deployment(
+ deployment = self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
@ -427,6 +436,61 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
input=input,
)
try:
assert deployment is not None
return deployment
except Exception as e:
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
deployment_dict = {}
for index, _deployment in enumerate(healthy_deployments):
if isinstance(_deployment, dict):
id = _deployment.get("model_info", {}).get("id")
### GET DEPLOYMENT TPM LIMIT ###
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm", None)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = float("inf")
### GET CURRENT TPM ###
current_tpm = tpm_values[index]
### GET DEPLOYMENT TPM LIMIT ###
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm", None)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = float("inf")
### GET CURRENT RPM ###
current_rpm = rpm_values[index]
deployment_dict[id] = {
"current_tpm": current_tpm,
"tpm_limit": _deployment_tpm,
"current_rpm": current_rpm,
"rpm_limit": _deployment_rpm,
}
raise ValueError(
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
)
def get_available_deployments( def get_available_deployments(
self, self,
model_group: str, model_group: str,
@ -464,7 +528,7 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
keys=rpm_keys
) # [1, 2, None, ..]
- return self._common_checks_available_deployment(
+ deployment = self._common_checks_available_deployment(
model_group=model_group,
healthy_deployments=healthy_deployments,
tpm_keys=tpm_keys,
@ -474,3 +538,58 @@ class LowestTPMLoggingHandler_v2(CustomLogger):
messages=messages,
input=input,
)
try:
assert deployment is not None
return deployment
except Exception as e:
### GET THE DICT OF TPM / RPM + LIMITS PER DEPLOYMENT ###
deployment_dict = {}
for index, _deployment in enumerate(healthy_deployments):
if isinstance(_deployment, dict):
id = _deployment.get("model_info", {}).get("id")
### GET DEPLOYMENT TPM LIMIT ###
_deployment_tpm = None
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("tpm", None)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("litellm_params", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = _deployment.get("model_info", {}).get(
"tpm", None
)
if _deployment_tpm is None:
_deployment_tpm = float("inf")
### GET CURRENT TPM ###
current_tpm = tpm_values[index]
### GET DEPLOYMENT TPM LIMIT ###
_deployment_rpm = None
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("rpm", None)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("litellm_params", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = _deployment.get("model_info", {}).get(
"rpm", None
)
if _deployment_rpm is None:
_deployment_rpm = float("inf")
### GET CURRENT RPM ###
current_rpm = rpm_values[index]
deployment_dict[id] = {
"current_tpm": current_tpm,
"tpm_limit": _deployment_tpm,
"current_rpm": current_rpm,
"rpm_limit": _deployment_rpm,
}
raise ValueError(
f"{RouterErrors.no_deployments_available.value}. Passed model={model_group}. Deployments={deployment_dict}"
)


@ -19,6 +19,7 @@ def setup_and_teardown():
0, os.path.abspath("../..")
) # Adds the project directory to the system path
import litellm
from litellm import Router
importlib.reload(litellm)
import asyncio


@ -119,7 +119,9 @@ def test_multiple_deployments_parallel():
# test_multiple_deployments_parallel() # test_multiple_deployments_parallel()
def test_cooldown_same_model_name(): @pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_cooldown_same_model_name(sync_mode):
# users could have the same model with different api_base # users could have the same model with different api_base
# example # example
# azure/chatgpt, api_base: 1234 # azure/chatgpt, api_base: 1234
@ -161,22 +163,40 @@ def test_cooldown_same_model_name():
num_retries=3, num_retries=3,
) # type: ignore ) # type: ignore
response = router.completion( if sync_mode:
model="gpt-3.5-turbo", response = router.completion(
messages=[{"role": "user", "content": "hello this request will pass"}], model="gpt-3.5-turbo",
) messages=[{"role": "user", "content": "hello this request will pass"}],
print(router.model_list) )
model_ids = [] print(router.model_list)
for model in router.model_list: model_ids = []
model_ids.append(model["model_info"]["id"]) for model in router.model_list:
print("\n litellm model ids ", model_ids) model_ids.append(model["model_info"]["id"])
print("\n litellm model ids ", model_ids)
# example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960'] # example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
assert ( assert (
model_ids[0] != model_ids[1] model_ids[0] != model_ids[1]
) # ensure both models have a uuid added, and they have different names ) # ensure both models have a uuid added, and they have different names
print("\ngot response\n", response) print("\ngot response\n", response)
else:
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hello this request will pass"}],
)
print(router.model_list)
model_ids = []
for model in router.model_list:
model_ids.append(model["model_info"]["id"])
print("\n litellm model ids ", model_ids)
# example litellm_model_names ['azure/chatgpt-v-2-ModelID-64321', 'azure/chatgpt-v-2-ModelID-63960']
assert (
model_ids[0] != model_ids[1]
) # ensure both models have a uuid added, and they have different names
print("\ngot response\n", response)
except Exception as e: except Exception as e:
pytest.fail(f"Got unexpected exception on router! - {e}") pytest.fail(f"Got unexpected exception on router! - {e}")


@ -161,40 +161,56 @@ async def make_async_calls():
return total_time return total_time
# def test_langfuse_logging_async_text_completion(): @pytest.mark.asyncio
# try: @pytest.mark.parametrize("stream", [False, True])
# pre_langfuse_setup() async def test_langfuse_logging_without_request_response(stream):
# litellm.set_verbose = False try:
# litellm.success_callback = ["langfuse"] import uuid
# async def _test_langfuse(): _unique_trace_name = f"litellm-test-{str(uuid.uuid4())}"
# response = await litellm.atext_completion( litellm.set_verbose = True
# model="gpt-3.5-turbo-instruct", litellm.turn_off_message_logging = True
# prompt="this is a test", litellm.success_callback = ["langfuse"]
# max_tokens=5, response = await litellm.acompletion(
# temperature=0.7, model="gpt-3.5-turbo",
# timeout=5, mock_response="It's simple to use and easy to get started",
# user="test_user", messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
# stream=True max_tokens=10,
# ) temperature=0.2,
# async for chunk in response: stream=stream,
# print() metadata={"trace_id": _unique_trace_name},
# print(chunk) )
# await asyncio.sleep(1) print(response)
# return response if stream:
async for chunk in response:
print(chunk)
# response = asyncio.run(_test_langfuse()) await asyncio.sleep(3)
# print(f"response: {response}")
# # # check langfuse.log to see if there was a failed response import langfuse
# search_logs("langfuse.log")
# except litellm.Timeout as e:
# pass
# except Exception as e:
# pytest.fail(f"An exception occurred - {e}")
langfuse_client = langfuse.Langfuse(
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
)
# test_langfuse_logging_async_text_completion() # get trace with _unique_trace_name
trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
print("trace_from_langfuse", trace)
_trace_data = trace.data
assert _trace_data[0].input == {"messages": "redacted-by-litellm"}
assert _trace_data[0].output == {
"role": "assistant",
"content": "redacted-by-litellm",
"function_call": None,
"tool_calls": None,
}
except Exception as e:
pytest.fail(f"An exception occurred - {e}")
@pytest.mark.skip(reason="beta test - checking langfuse output") @pytest.mark.skip(reason="beta test - checking langfuse output")
@ -334,6 +350,220 @@ def test_langfuse_logging_function_calling():
# test_langfuse_logging_function_calling()
def test_langfuse_existing_trace_id():
"""
When existing trace id is passed, don't set trace params -> prevents overwriting the trace
Pass 1 logging object with a trace
Pass 2nd logging object with the trace id
Assert no changes to the trace
"""
# Test - if the logs were sent to the correct team on langfuse
import litellm, datetime
from litellm.integrations.langfuse import LangFuseLogger
langfuse_Logger = LangFuseLogger(
langfuse_public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
langfuse_secret=os.getenv("LANGFUSE_PROJECT2_SECRET"),
)
litellm.success_callback = ["langfuse"]
# langfuse_args = {'kwargs': { 'start_time': 'end_time': datetime.datetime(2024, 5, 1, 7, 31, 29, 903685), 'user_id': None, 'print_verbose': <function print_verbose at 0x109d1f420>, 'level': 'DEFAULT', 'status_message': None}
response_obj = litellm.ModelResponse(
id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
choices=[
litellm.Choices(
finish_reason="stop",
index=0,
message=litellm.Message(
content="I'm sorry, I am an AI assistant and do not have real-time information. I recommend checking a reliable weather website or app for the most up-to-date weather information in Boston.",
role="assistant",
),
)
],
created=1714573888,
model="gpt-3.5-turbo-0125",
object="chat.completion",
system_fingerprint="fp_3b956da36b",
usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
)
### NEW TRACE ###
message = [{"role": "user", "content": "what's the weather in boston"}]
langfuse_args = {
"response_obj": response_obj,
"kwargs": {
"model": "gpt-3.5-turbo",
"litellm_params": {
"acompletion": False,
"api_key": None,
"force_timeout": 600,
"logger_fn": None,
"verbose": False,
"custom_llm_provider": "openai",
"api_base": "https://api.openai.com/v1/",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"model_alias_map": {},
"completion_call_id": None,
"metadata": None,
"model_info": None,
"proxy_server_request": None,
"preset_cache_key": None,
"no-log": False,
"stream_response": {},
},
"messages": message,
"optional_params": {"temperature": 0.1, "extra_body": {}},
"start_time": "2024-05-01 07:31:27.986164",
"stream": False,
"user": None,
"call_type": "completion",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"completion_start_time": "2024-05-01 07:31:29.903685",
"temperature": 0.1,
"extra_body": {},
"input": [{"role": "user", "content": "what's the weather in boston"}],
"api_key": "my-api-key",
"additional_args": {
"complete_input_dict": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "what's the weather in boston"}
],
"temperature": 0.1,
"extra_body": {},
}
},
"log_event_type": "successful_api_call",
"end_time": "2024-05-01 07:31:29.903685",
"cache_hit": None,
"response_cost": 6.25e-05,
},
"start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
"end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
"user_id": None,
"print_verbose": litellm.print_verbose,
"level": "DEFAULT",
"status_message": None,
}
langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
import langfuse
langfuse_client = langfuse.Langfuse(
public_key=os.getenv("LANGFUSE_PROJECT2_PUBLIC"),
secret_key=os.getenv("LANGFUSE_PROJECT2_SECRET"),
)
trace_id = langfuse_response_object["trace_id"]
langfuse_client.flush()
time.sleep(2)
print(langfuse_client.get_trace(id=trace_id))
initial_langfuse_trace = langfuse_client.get_trace(id=trace_id)
### EXISTING TRACE ###
new_metadata = {"existing_trace_id": trace_id}
new_messages = [{"role": "user", "content": "What do you know?"}]
new_response_obj = litellm.ModelResponse(
id="chatcmpl-9K5HUAbVRqFrMZKXL0WoC295xhguY",
choices=[
litellm.Choices(
finish_reason="stop",
index=0,
message=litellm.Message(
content="What do I know?",
role="assistant",
),
)
],
created=1714573888,
model="gpt-3.5-turbo-0125",
object="chat.completion",
system_fingerprint="fp_3b956da36b",
usage=litellm.Usage(completion_tokens=37, prompt_tokens=14, total_tokens=51),
)
langfuse_args = {
"response_obj": new_response_obj,
"kwargs": {
"model": "gpt-3.5-turbo",
"litellm_params": {
"acompletion": False,
"api_key": None,
"force_timeout": 600,
"logger_fn": None,
"verbose": False,
"custom_llm_provider": "openai",
"api_base": "https://api.openai.com/v1/",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"model_alias_map": {},
"completion_call_id": None,
"metadata": new_metadata,
"model_info": None,
"proxy_server_request": None,
"preset_cache_key": None,
"no-log": False,
"stream_response": {},
},
"messages": new_messages,
"optional_params": {"temperature": 0.1, "extra_body": {}},
"start_time": "2024-05-01 07:31:27.986164",
"stream": False,
"user": None,
"call_type": "completion",
"litellm_call_id": "508113a1-c6f1-48ce-a3e1-01c6cce9330e",
"completion_start_time": "2024-05-01 07:31:29.903685",
"temperature": 0.1,
"extra_body": {},
"input": [{"role": "user", "content": "what's the weather in boston"}],
"api_key": "my-api-key",
"additional_args": {
"complete_input_dict": {
"model": "gpt-3.5-turbo",
"messages": [
{"role": "user", "content": "what's the weather in boston"}
],
"temperature": 0.1,
"extra_body": {},
}
},
"log_event_type": "successful_api_call",
"end_time": "2024-05-01 07:31:29.903685",
"cache_hit": None,
"response_cost": 6.25e-05,
},
"start_time": datetime.datetime(2024, 5, 1, 7, 31, 27, 986164),
"end_time": datetime.datetime(2024, 5, 1, 7, 31, 29, 903685),
"user_id": None,
"print_verbose": litellm.print_verbose,
"level": "DEFAULT",
"status_message": None,
}
langfuse_response_object = langfuse_Logger.log_event(**langfuse_args)
new_trace_id = langfuse_response_object["trace_id"]
assert new_trace_id == trace_id
langfuse_client.flush()
time.sleep(2)
print(langfuse_client.get_trace(id=trace_id))
new_langfuse_trace = langfuse_client.get_trace(id=trace_id)
assert dict(initial_langfuse_trace) == dict(new_langfuse_trace)
def test_langfuse_logging_tool_calling():
litellm.set_verbose = True


@ -68,6 +68,7 @@ async def test_get_api_base():
await _pl.alerting_handler(
message=slow_message + request_info,
level="Low",
alert_type="llm_too_slow",
)
print("passed test_get_api_base")


@ -394,6 +394,8 @@ async def test_async_vertexai_response():
pass
except litellm.Timeout as e:
pass
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"An exception occurred: {e}")
@ -636,7 +638,10 @@ def test_gemini_pro_function_calling():
# gemini_pro_function_calling() # gemini_pro_function_calling()
def test_gemini_pro_function_calling_streaming(): @pytest.mark.parametrize("stream", [False, True])
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
async def test_gemini_pro_function_calling_streaming(stream, sync_mode):
load_vertex_ai_credentials() load_vertex_ai_credentials()
litellm.set_verbose = True litellm.set_verbose = True
tools = [ tools = [
@ -665,19 +670,41 @@ def test_gemini_pro_function_calling_streaming():
"content": "What's the weather like in Boston today in fahrenheit?", "content": "What's the weather like in Boston today in fahrenheit?",
} }
] ]
optional_params = {
"tools": tools,
"tool_choice": "auto",
"n": 1,
"stream": stream,
"temperature": 0.1,
}
try: try:
completion = litellm.completion( if sync_mode == True:
model="gemini-pro", response = litellm.completion(
messages=messages, model="gemini-pro", messages=messages, **optional_params
tools=tools, )
tool_choice="auto", print(f"completion: {response}")
stream=True,
) if stream == True:
print(f"completion: {completion}") # assert completion.choices[0].message.content is None
# assert completion.choices[0].message.content is None # assert len(completion.choices[0].message.tool_calls) == 1
# assert len(completion.choices[0].message.tool_calls) == 1 for chunk in response:
for chunk in completion: assert isinstance(chunk, litellm.ModelResponse)
print(f"chunk: {chunk}") else:
assert isinstance(response, litellm.ModelResponse)
else:
response = await litellm.acompletion(
model="gemini-pro", messages=messages, **optional_params
)
print(f"completion: {response}")
if stream == True:
# assert completion.choices[0].message.content is None
# assert len(completion.choices[0].message.tool_calls) == 1
async for chunk in response:
print(f"chunk: {chunk}")
assert isinstance(chunk, litellm.ModelResponse)
else:
assert isinstance(response, litellm.ModelResponse)
except litellm.APIError as e: except litellm.APIError as e:
pass pass
except litellm.RateLimitError as e: except litellm.RateLimitError as e:


@ -57,7 +57,7 @@ def test_completion_custom_provider_model_name():
messages=messages,
logger_fn=logger_fn,
)
- # Add any assertions here to, check the response
+ # Add any assertions here to,check the response
print(response)
print(response["choices"][0]["finish_reason"])
except litellm.Timeout as e: except litellm.Timeout as e:
@ -231,6 +231,76 @@ def test_completion_claude_3_function_call():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_cohere_command_r_plus_function_call():
litellm.set_verbose = True
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
messages = [
{
"role": "user",
"content": "What's the weather like in Boston today in Fahrenheit?",
}
]
try:
# test without max tokens
response = completion(
model="command-r-plus",
messages=messages,
tools=tools,
tool_choice="auto",
)
# Add any assertions, here to check response args
print(response)
assert isinstance(response.choices[0].message.tool_calls[0].function.name, str)
assert isinstance(
response.choices[0].message.tool_calls[0].function.arguments, str
)
messages.append(
response.choices[0].message.model_dump()
) # Add assistant tool invokes
tool_result = (
'{"location": "Boston", "temperature": "72", "unit": "fahrenheit"}'
)
# Add user submitted tool results in the OpenAI format
messages.append(
{
"tool_call_id": response.choices[0].message.tool_calls[0].id,
"role": "tool",
"name": response.choices[0].message.tool_calls[0].function.name,
"content": tool_result,
}
)
# In the second response, Cohere should deduce answer from tool results
second_response = completion(
model="command-r-plus",
messages=messages,
tools=tools,
tool_choice="auto",
)
print(second_response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_parse_xml_params():
from litellm.llms.prompt_templates.factory import parse_xml_params
@ -1291,6 +1361,7 @@ def test_completion_logprobs_stream():
for chunk in response:
# check if at least one chunk has log probs
print(chunk)
print(f"chunk.choices[0]: {chunk.choices[0]}")
if "logprobs" in chunk.choices[0]:
# assert we got a valid logprob in the choices
assert len(chunk.choices[0].logprobs.content[0].top_logprobs) == 3
@ -1781,7 +1852,6 @@ def test_completion_replicate_llama3():
print("RESPONSE STRING\n", response_str) print("RESPONSE STRING\n", response_str)
if type(response_str) != str: if type(response_str) != str:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
raise Exception("it worked!")
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@ -2655,6 +2725,88 @@ def test_completion_palm_stream():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_watsonx():
litellm.set_verbose = True
model_name = "watsonx/ibm/granite-13b-chat-v2"
try:
response = completion(
model=model_name,
messages=messages,
stop=["stop"],
max_tokens=20,
)
# Add any assertions here to check the response
print(response)
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
@pytest.mark.parametrize(
"provider, model, project, region_name, token",
[
("azure", "chatgpt-v-2", None, None, "test-token"),
("vertex_ai", "anthropic-claude-3", "adroit-crow-1", "us-east1", None),
("watsonx", "ibm/granite", "96946574", "dallas", "1234"),
("bedrock", "anthropic.claude-3", None, "us-east-1", None),
],
)
def test_unified_auth_params(provider, model, project, region_name, token):
"""
Check if params = ["project", "region_name", "token"]
are correctly translated for = ["azure", "vertex_ai", "watsonx", "aws"]
tests get_optional_params
"""
data = {
"project": project,
"region_name": region_name,
"token": token,
"custom_llm_provider": provider,
"model": model,
}
translated_optional_params = litellm.utils.get_optional_params(**data)
if provider == "azure":
special_auth_params = (
litellm.AzureOpenAIConfig().get_mapped_special_auth_params()
)
elif provider == "bedrock":
special_auth_params = (
litellm.AmazonBedrockGlobalConfig().get_mapped_special_auth_params()
)
elif provider == "vertex_ai":
special_auth_params = litellm.VertexAIConfig().get_mapped_special_auth_params()
elif provider == "watsonx":
special_auth_params = (
litellm.IBMWatsonXAIConfig().get_mapped_special_auth_params()
)
for param, value in special_auth_params.items():
assert param in data
assert value in translated_optional_params
@pytest.mark.asyncio
async def test_acompletion_watsonx():
litellm.set_verbose = True
model_name = "watsonx/ibm/granite-13b-chat-v2"
print("testing watsonx")
try:
response = await litellm.acompletion(
model=model_name,
messages=messages,
temperature=0.2,
max_tokens=80,
)
# Add any assertions here to check the response
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_palm_stream()
# test_completion_deep_infra()


@ -328,3 +328,56 @@ def test_dalle_3_azure_cost_tracking():
completion_response=response, call_type="image_generation"
)
assert cost > 0
def test_replicate_llama3_cost_tracking():
litellm.set_verbose = True
model = "replicate/meta/meta-llama-3-8b-instruct"
litellm.register_model(
{
"replicate/meta/meta-llama-3-8b-instruct": {
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
}
}
)
response = litellm.ModelResponse(
id="chatcmpl-cad7282f-7f68-41e7-a5ab-9eb33ae301dc",
choices=[
litellm.utils.Choices(
finish_reason="stop",
index=0,
message=litellm.utils.Message(
content="I'm doing well, thanks for asking! I'm here to help you with any questions or tasks you may have. How can I assist you today?",
role="assistant",
),
)
],
created=1714401369,
model="replicate/meta/meta-llama-3-8b-instruct",
object="chat.completion",
system_fingerprint=None,
usage=litellm.utils.Usage(
prompt_tokens=48, completion_tokens=31, total_tokens=79
),
)
cost = litellm.completion_cost(
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"cost: {cost}")
cost = round(cost, 5)
expected_cost = round(
litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
"input_cost_per_token"
]
* 48
+ litellm.model_cost["replicate/meta/meta-llama-3-8b-instruct"][
"output_cost_per_token"
]
* 31,
5,
)
assert cost == expected_cost


@ -26,6 +26,9 @@ class DBModel(BaseModel):
model_info: dict model_info: dict
litellm_params: dict litellm_params: dict
class Config:
protected_namespaces = ()
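Side note on the `protected_namespaces = ()` addition above: pydantic v2 warns when field names such as `model_info` collide with its reserved `model_` namespace, and the empty tuple disables that check (the new test further down asserts the warning is gone). A minimal sketch of the pattern — field names copied from the hunk, everything else assumed:

```python
from pydantic import BaseModel

class DBModel(BaseModel):
    model_info: dict
    litellm_params: dict

    class Config:
        # Without this, pydantic v2 emits a "conflict with protected namespace 'model_'"
        # warning for `model_info` at class definition time.
        protected_namespaces = ()
```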
@pytest.mark.asyncio
async def test_delete_deployment():


@ -529,6 +529,7 @@ def test_chat_bedrock_stream():
@pytest.mark.asyncio
async def test_async_chat_bedrock_stream():
try:
litellm.set_verbose = True
customHandler = CompletionCustomHandler()
litellm.callbacks = [customHandler]
response = await litellm.acompletion(


@ -484,6 +484,20 @@ def test_mistral_embeddings():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
@pytest.mark.skip(reason="local test")
def test_watsonx_embeddings():
try:
litellm.set_verbose = True
response = litellm.embedding(
model="watsonx/ibm/slate-30m-english-rtrvr",
input=["good morning from litellm"],
)
print(f"response: {response}")
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_mistral_embeddings()


@ -25,7 +25,7 @@ def test_empty_content():
pass
function_setup(
- original_function=completion,
+ original_function="completion",
rules_obj=rules_obj,
start_time=datetime.now(),
messages=[],


@ -136,8 +136,8 @@ def test_image_generation_bedrock():
litellm.set_verbose = True
response = litellm.image_generation(
prompt="A cute baby sea otter",
- model="bedrock/stability.stable-diffusion-xl-v0",
+ model="bedrock/stability.stable-diffusion-xl-v1",
- aws_region_name="us-east-1",
+ aws_region_name="us-west-2",
)
print(f"response: {response}")
except litellm.RateLimitError as e:
@ -156,8 +156,8 @@ async def test_aimage_generation_bedrock_with_optional_params():
try:
response = await litellm.aimage_generation(
prompt="A cute baby sea otter",
- model="bedrock/stability.stable-diffusion-xl-v0",
+ model="bedrock/stability.stable-diffusion-xl-v1",
- size="128x128",
+ size="256x256",
)
print(f"response: {response}")
except litellm.RateLimitError as e:


@ -201,6 +201,7 @@ async def test_router_atext_completion_streaming():
@pytest.mark.asyncio
async def test_router_completion_streaming():
litellm.set_verbose = True
messages = [
{"role": "user", "content": "Hello, can you generate a 500 words poem?"}
]
@ -219,9 +220,9 @@ async def test_router_completion_streaming():
{
"model_name": "azure-model",
"litellm_params": {
- "model": "azure/gpt-35-turbo",
+ "model": "azure/gpt-turbo",
- "api_key": "os.environ/AZURE_EUROPE_API_KEY",
+ "api_key": "os.environ/AZURE_FRANCE_API_KEY",
- "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
+ "api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
@ -229,9 +230,9 @@ async def test_router_completion_streaming():
{
"model_name": "azure-model",
"litellm_params": {
- "model": "azure/gpt-35-turbo",
+ "model": "azure/gpt-turbo",
- "api_key": "os.environ/AZURE_CANADA_API_KEY",
+ "api_key": "os.environ/AZURE_FRANCE_API_KEY",
- "api_base": "https://my-endpoint-canada-berri992.openai.azure.com",
+ "api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 3},
@ -262,4 +263,4 @@ async def test_router_completion_streaming():
## check if calls equally distributed
cache_dict = router.cache.get_cache(key=cache_key)
for k, v in cache_dict.items():
- assert v == 1
+ assert v == 1, f"Failed. K={k} called v={v} times, cache_dict={cache_dict}"


@ -555,3 +555,171 @@ async def test_lowest_latency_routing_with_timeouts():
# ALL the Requests should have been routed to the fast-endpoint # ALL the Requests should have been routed to the fast-endpoint
assert deployments["fast-endpoint"] == 10 assert deployments["fast-endpoint"] == 10
@pytest.mark.asyncio
async def test_lowest_latency_routing_first_pick():
"""
PROD Test:
- When all deployments are latency=0, it should randomly pick a deployment
- IT SHOULD NEVER PICK THE Very First deployment everytime all deployment latencies are 0
- This ensures that after the ttl window resets it randomly picks a deployment
"""
import litellm
litellm.set_verbose = True
router = Router(
model_list=[
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint-2",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint-2"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint-2",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint-3"},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "openai/fast-endpoint-2",
"api_base": "https://exampleopenaiendpoint-production.up.railway.app/",
"api_key": "fake-key",
},
"model_info": {"id": "fast-endpoint-4"},
},
],
routing_strategy="latency-based-routing",
routing_strategy_args={"ttl": 0.0000000001},
set_verbose=True,
debug_level="DEBUG",
) # type: ignore
deployments = {}
for _ in range(5):
response = await router.acompletion(
model="azure-model", messages=[{"role": "user", "content": "hello"}]
)
print(response)
_picked_model_id = response._hidden_params["model_id"]
if _picked_model_id not in deployments:
deployments[_picked_model_id] = 1
else:
deployments[_picked_model_id] += 1
await asyncio.sleep(0.000000000005)
print("deployments", deployments)
# assert that len(deployments) >1
assert len(deployments) > 1
@pytest.mark.parametrize("buffer", [0, 1])
@pytest.mark.asyncio
async def test_lowest_latency_routing_buffer(buffer):
"""
Allow shuffling calls within a certain latency buffer
"""
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"rpm": 1440,
},
"model_info": {"id": 1},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-35-turbo",
"api_key": "os.environ/AZURE_EUROPE_API_KEY",
"api_base": "https://my-endpoint-europe-berri-992.openai.azure.com",
"rpm": 6,
},
"model_info": {"id": 2},
},
]
router = Router(
model_list=model_list,
routing_strategy="latency-based-routing",
set_verbose=False,
num_retries=3,
routing_strategy_args={"lowest_latency_buffer": buffer},
) # type: ignore
## DEPLOYMENT 1 ##
deployment_id = 1
kwargs = {
"litellm_params": {
"metadata": {
"model_group": "azure-model",
},
"model_info": {"id": 1},
}
}
start_time = time.time()
response_obj = {"usage": {"total_tokens": 50}}
time.sleep(3)
end_time = time.time()
router.lowestlatency_logger.log_success_event(
response_obj=response_obj,
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
)
## DEPLOYMENT 2 ##
deployment_id = 2
kwargs = {
"litellm_params": {
"metadata": {
"model_group": "azure-model",
},
"model_info": {"id": 2},
}
}
start_time = time.time()
response_obj = {"usage": {"total_tokens": 20}}
time.sleep(2)
end_time = time.time()
router.lowestlatency_logger.log_success_event(
response_obj=response_obj,
kwargs=kwargs,
start_time=start_time,
end_time=end_time,
)
## CHECK WHAT'S SELECTED ##
# print(router.lowesttpm_logger.get_available_deployments(model_group="azure-model"))
selected_deployments = {}
for _ in range(50):
print(router.get_available_deployment(model="azure-model"))
selected_deployments[
router.get_available_deployment(model="azure-model")["model_info"]["id"]
] = 1
if buffer == 0:
assert len(selected_deployments.keys()) == 1
else:
assert len(selected_deployments.keys()) == 2


@ -0,0 +1,10 @@
import warnings
import pytest
def test_namespace_conflict_warning():
with warnings.catch_warnings(record=True) as recorded_warnings:
warnings.simplefilter("always") # Capture all warnings
import litellm
# Check that no warning with the specific message was raised
assert not any("conflict with protected namespace" in str(w.message) for w in recorded_warnings), "Test failed: 'conflict with protected namespace' warning was encountered!"


@ -1,7 +1,7 @@
#### What this tests ####
# This tests litellm router
- import sys, os, time
+ import sys, os, time, openai
import traceback, asyncio
import pytest
@ -14,10 +14,169 @@ from litellm.router import Deployment, LiteLLM_Params, ModelInfo
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict
from dotenv import load_dotenv
import os, httpx
load_dotenv()
@pytest.mark.parametrize("num_retries", [None, 2])
@pytest.mark.parametrize("max_retries", [None, 4])
def test_router_num_retries_init(num_retries, max_retries):
"""
- test when num_retries set v/s not
- test client value when max retries set v/s not
"""
router = Router(
model_list=[
{
"model_name": "gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": "bad-key",
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
"max_retries": max_retries,
},
"model_info": {"id": 12345},
},
],
num_retries=num_retries,
)
if num_retries is not None:
assert router.num_retries == num_retries
else:
assert router.num_retries == openai.DEFAULT_MAX_RETRIES
model_client = router._get_client(
{"model_info": {"id": 12345}}, client_type="async", kwargs={}
)
if max_retries is not None:
assert getattr(model_client, "max_retries") == max_retries
else:
assert getattr(model_client, "max_retries") == 0
@pytest.mark.parametrize(
"timeout", [10, 1.0, httpx.Timeout(timeout=300.0, connect=20.0)]
)
@pytest.mark.parametrize("ssl_verify", [True, False])
def test_router_timeout_init(timeout, ssl_verify):
"""
Allow user to pass httpx.Timeout
related issue - https://github.com/BerriAI/litellm/issues/3162
"""
litellm.ssl_verify = ssl_verify
router = Router(
model_list=[
{
"model_name": "test-model",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
"timeout": timeout,
},
"model_info": {"id": 1234},
}
]
)
model_client = router._get_client(
deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
)
assert getattr(model_client, "timeout") == timeout
print(f"vars model_client: {vars(model_client)}")
http_client = getattr(model_client, "_client")
print(f"http client: {vars(http_client)}, ssl_Verify={ssl_verify}")
if ssl_verify == False:
assert http_client._transport._pool._ssl_context.verify_mode.name == "CERT_NONE"
else:
assert (
http_client._transport._pool._ssl_context.verify_mode.name
== "CERT_REQUIRED"
)
@pytest.mark.parametrize("sync_mode", [False, True])
@pytest.mark.asyncio
async def test_router_retries(sync_mode):
"""
- make sure retries work as expected
"""
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad-key"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_base": os.getenv("AZURE_API_BASE"),
"api_version": os.getenv("AZURE_API_VERSION"),
},
},
]
router = Router(model_list=model_list, num_retries=2)
if sync_mode:
router.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
else:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
@pytest.mark.parametrize(
"mistral_api_base",
[
"os.environ/AZURE_MISTRAL_API_BASE",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1/",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/v1",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com/",
"https://Mistral-large-nmefg-serverless.eastus2.inference.ai.azure.com",
],
)
def test_router_azure_ai_studio_init(mistral_api_base):
router = Router(
model_list=[
{
"model_name": "test-model",
"litellm_params": {
"model": "azure/mistral-large-latest",
"api_key": "os.environ/AZURE_MISTRAL_API_KEY",
"api_base": mistral_api_base,
},
"model_info": {"id": 1234},
}
]
)
model_client = router._get_client(
deployment={"model_info": {"id": 1234}}, client_type="sync_client", kwargs={}
)
url = getattr(model_client, "_base_url")
uri_reference = str(getattr(url, "_uri_reference"))
print(f"uri_reference: {uri_reference}")
assert "/v1/" in uri_reference
assert uri_reference.count("v1") == 1
def test_exception_raising():
# this tests if the router raises an exception when invalid params are set
# in this test both deployments have bad keys - Keep this test. It validates if the router raises the most recent exception
@ -995,6 +1154,7 @@ def test_consistent_model_id():
assert id1 == id2
@pytest.mark.skip(reason="local test")
def test_reading_keys_os_environ():
import openai
@ -1094,6 +1254,7 @@ def test_reading_keys_os_environ():
# test_reading_keys_os_environ()
@pytest.mark.skip(reason="local test")
def test_reading_openai_keys_os_environ():
import openai


@ -46,6 +46,7 @@ def test_async_fallbacks(caplog):
router = Router( router = Router(
model_list=model_list, model_list=model_list,
fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}], fallbacks=[{"gpt-3.5-turbo": ["azure/gpt-3.5-turbo"]}],
num_retries=1,
) )
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
@ -81,8 +82,8 @@ def test_async_fallbacks(caplog):
# Define the expected log messages # Define the expected log messages
# - error request, falling back notice, success notice # - error request, falling back notice, success notice
expected_logs = [ expected_logs = [
"Intialized router with Routing strategy: simple-shuffle\n\nRouting fallbacks: [{'gpt-3.5-turbo': ['azure/gpt-3.5-turbo']}]\n\nRouting context window fallbacks: None\n\nRouter Redis Caching=None",
"litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m", "litellm.acompletion(model=gpt-3.5-turbo)\x1b[31m Exception OpenAIException - Error code: 401 - {'error': {'message': 'Incorrect API key provided: bad-key. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}\x1b[0m",
"litellm.acompletion(model=None)\x1b[31m Exception No deployments available for selected model, passed model=gpt-3.5-turbo\x1b[0m",
"Falling back to model_group = azure/gpt-3.5-turbo", "Falling back to model_group = azure/gpt-3.5-turbo",
"litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m", "litellm.acompletion(model=azure/chatgpt-v-2)\x1b[32m 200 OK\x1b[0m",
] ]


@ -22,10 +22,10 @@ class MyCustomHandler(CustomLogger):
def log_pre_api_call(self, model, messages, kwargs): def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call") print(f"Pre-API Call")
print( print(
f"previous_models: {kwargs['litellm_params']['metadata']['previous_models']}" f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
) )
self.previous_models += len( self.previous_models = len(
kwargs["litellm_params"]["metadata"]["previous_models"] kwargs["litellm_params"]["metadata"].get("previous_models", [])
) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]} ) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
print(f"self.previous_models: {self.previous_models}") print(f"self.previous_models: {self.previous_models}")
@ -127,7 +127,7 @@ def test_sync_fallbacks():
response = router.completion(**kwargs) response = router.completion(**kwargs)
print(f"response: {response}") print(f"response: {response}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4
print("Passed ! Test router_fallbacks: test_sync_fallbacks()") print("Passed ! Test router_fallbacks: test_sync_fallbacks()")
router.reset() router.reset()
@ -140,7 +140,7 @@ def test_sync_fallbacks():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_async_fallbacks(): async def test_async_fallbacks():
litellm.set_verbose = False litellm.set_verbose = True
model_list = [ model_list = [
{ # list of model deployments { # list of model deployments
"model_name": "azure/gpt-3.5-turbo", # openai model name "model_name": "azure/gpt-3.5-turbo", # openai model name
@ -209,12 +209,13 @@ async def test_async_fallbacks():
user_message = "Hello, how are you?" user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}] messages = [{"content": user_message, "role": "user"}]
try: try:
kwargs["model"] = "azure/gpt-3.5-turbo"
response = await router.acompletion(**kwargs) response = await router.acompletion(**kwargs)
print(f"customHandler.previous_models: {customHandler.previous_models}") print(f"customHandler.previous_models: {customHandler.previous_models}")
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -268,7 +269,7 @@ def test_sync_fallbacks_embeddings():
response = router.embedding(**kwargs) response = router.embedding(**kwargs)
print(f"customHandler.previous_models: {customHandler.previous_models}") print(f"customHandler.previous_models: {customHandler.previous_models}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -322,7 +323,7 @@ async def test_async_fallbacks_embeddings():
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -401,7 +402,7 @@ def test_dynamic_fallbacks_sync():
response = router.completion(**kwargs) response = router.completion(**kwargs)
print(f"response: {response}") print(f"response: {response}")
time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread time.sleep(0.05) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {e}") pytest.fail(f"An exception occurred - {e}")
@ -487,7 +488,7 @@ async def test_dynamic_fallbacks_async():
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred - {e}") pytest.fail(f"An exception occurred - {e}")
@ -572,7 +573,7 @@ async def test_async_fallbacks_streaming():
await asyncio.sleep( await asyncio.sleep(
0.05 0.05
) # allow a delay as success_callbacks are on a separate thread ) # allow a delay as success_callbacks are on a separate thread
assert customHandler.previous_models == 1 # 0 retries, 1 fallback assert customHandler.previous_models == 4 # 1 init call, 2 retries, 1 fallback
router.reset() router.reset()
except litellm.Timeout as e: except litellm.Timeout as e:
pass pass
@ -751,7 +752,7 @@ async def test_async_fallbacks_max_retries_per_request():
router.reset() router.reset()
def test_usage_based_routing_fallbacks(): def test_ausage_based_routing_fallbacks():
try: try:
# [Prod Test] # [Prod Test]
# IT tests Usage Based Routing with fallbacks # IT tests Usage Based Routing with fallbacks
@ -765,10 +766,10 @@ def test_usage_based_routing_fallbacks():
load_dotenv() load_dotenv()
# Constants for TPM and RPM allocation # Constants for TPM and RPM allocation
AZURE_FAST_TPM = 3 AZURE_FAST_RPM = 1
AZURE_BASIC_TPM = 4 AZURE_BASIC_RPM = 1
OPENAI_TPM = 400 OPENAI_RPM = 2
ANTHROPIC_TPM = 100000 ANTHROPIC_RPM = 100000
def get_azure_params(deployment_name: str): def get_azure_params(deployment_name: str):
params = { params = {
@ -797,22 +798,26 @@ def test_usage_based_routing_fallbacks():
{ {
"model_name": "azure/gpt-4-fast", "model_name": "azure/gpt-4-fast",
"litellm_params": get_azure_params("chatgpt-v-2"), "litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_FAST_TPM, "model_info": {"id": 1},
"rpm": AZURE_FAST_RPM,
}, },
{ {
"model_name": "azure/gpt-4-basic", "model_name": "azure/gpt-4-basic",
"litellm_params": get_azure_params("chatgpt-v-2"), "litellm_params": get_azure_params("chatgpt-v-2"),
"tpm": AZURE_BASIC_TPM, "model_info": {"id": 2},
"rpm": AZURE_BASIC_RPM,
}, },
{ {
"model_name": "openai-gpt-4", "model_name": "openai-gpt-4",
"litellm_params": get_openai_params("gpt-3.5-turbo"), "litellm_params": get_openai_params("gpt-3.5-turbo"),
"tpm": OPENAI_TPM, "model_info": {"id": 3},
"rpm": OPENAI_RPM,
}, },
{ {
"model_name": "anthropic-claude-instant-1.2", "model_name": "anthropic-claude-instant-1.2",
"litellm_params": get_anthropic_params("claude-instant-1.2"), "litellm_params": get_anthropic_params("claude-instant-1.2"),
"tpm": ANTHROPIC_TPM, "model_info": {"id": 4},
"rpm": ANTHROPIC_RPM,
}, },
] ]
# litellm.set_verbose=True # litellm.set_verbose=True
@ -830,6 +835,7 @@ def test_usage_based_routing_fallbacks():
routing_strategy="usage-based-routing", routing_strategy="usage-based-routing",
redis_host=os.environ["REDIS_HOST"], redis_host=os.environ["REDIS_HOST"],
redis_port=os.environ["REDIS_PORT"], redis_port=os.environ["REDIS_PORT"],
num_retries=0,
) )
messages = [ messages = [
@ -842,10 +848,10 @@ def test_usage_based_routing_fallbacks():
mock_response="very nice to meet you", mock_response="very nice to meet you",
) )
print("response: ", response) print("response: ", response)
print("response._hidden_params: ", response._hidden_params) print(f"response._hidden_params: {response._hidden_params}")
# in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass # in this test, we expect azure/gpt-4 fast to fail, then azure-gpt-4 basic to fail and then openai-gpt-4 to pass
# the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM # the token count of this message is > AZURE_FAST_TPM, > AZURE_BASIC_TPM
assert response._hidden_params["custom_llm_provider"] == "openai" assert response._hidden_params["model_id"] == "1"
# now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2 # now make 100 mock requests to OpenAI - expect it to fallback to anthropic-claude-instant-1.2
for i in range(20): for i in range(20):
@ -859,7 +865,7 @@ def test_usage_based_routing_fallbacks():
print("response._hidden_params: ", response._hidden_params) print("response._hidden_params: ", response._hidden_params)
if i == 19: if i == 19:
# by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2 # by the 19th call we should have hit TPM LIMIT for OpenAI, it should fallback to anthropic-claude-instant-1.2
assert response._hidden_params["custom_llm_provider"] == "anthropic" assert response._hidden_params["model_id"] == "4"
except Exception as e: except Exception as e:
pytest.fail(f"An exception occurred {e}") pytest.fail(f"An exception occurred {e}")


@ -203,7 +203,7 @@ def test_timeouts_router():
},
},
]
- router = Router(model_list=model_list)
+ router = Router(model_list=model_list, num_retries=0)
print("PASSED !")
@ -396,7 +396,9 @@ def test_router_init_gpt_4_vision_enhancements():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_openai_with_organization(): @pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_openai_with_organization(sync_mode):
try: try:
print("Testing OpenAI with organization") print("Testing OpenAI with organization")
model_list = [ model_list = [
@ -418,32 +420,65 @@ def test_openai_with_organization():
print(router.model_list) print(router.model_list)
print(router.model_list[0]) print(router.model_list[0])
openai_client = router._get_client( if sync_mode:
deployment=router.model_list[0], openai_client = router._get_client(
kwargs={"input": ["hello"], "model": "openai-bad-org"}, deployment=router.model_list[0],
) kwargs={"input": ["hello"], "model": "openai-bad-org"},
print(vars(openai_client))
assert openai_client.organization == "org-ikDc4ex8NB"
# bad org raises error
try:
response = router.completion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
) )
pytest.fail("Request should have failed - This organization does not exist") print(vars(openai_client))
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works assert openai_client.organization == "org-ikDc4ex8NB"
response = router.completion(
model="openai-good-org", # bad org raises error
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5, try:
) response = router.completion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
)
pytest.fail(
"Request should have failed - This organization does not exist"
)
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works
response = router.completion(
model="openai-good-org",
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5,
)
else:
openai_client = router._get_client(
deployment=router.model_list[0],
kwargs={"input": ["hello"], "model": "openai-bad-org"},
client_type="async",
)
print(vars(openai_client))
assert openai_client.organization == "org-ikDc4ex8NB"
# bad org raises error
try:
response = await router.acompletion(
model="openai-bad-org",
messages=[{"role": "user", "content": "this is a test"}],
)
pytest.fail(
"Request should have failed - This organization does not exist"
)
except Exception as e:
print("Got exception: " + str(e))
assert "No such organization: org-ikDc4ex8NB" in str(e)
# good org works
response = await router.acompletion(
model="openai-good-org",
messages=[{"role": "user", "content": "this is a test"}],
max_tokens=5,
)
except Exception as e: except Exception as e:
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")


@ -0,0 +1,121 @@
#### What this tests ####
# This tests calling router with fallback models
import sys, os, time
import traceback, asyncio
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import Router
from litellm.integrations.custom_logger import CustomLogger
class MyCustomHandler(CustomLogger):
success: bool = False
failure: bool = False
previous_models: int = 0
def log_pre_api_call(self, model, messages, kwargs):
print(f"Pre-API Call")
print(
f"previous_models: {kwargs['litellm_params']['metadata'].get('previous_models', None)}"
)
self.previous_models = len(
kwargs["litellm_params"]["metadata"].get("previous_models", [])
) # {"previous_models": [{"model": litellm_model_name, "exception_type": AuthenticationError, "exception_string": <complete_traceback>}]}
print(f"self.previous_models: {self.previous_models}")
def log_post_api_call(self, kwargs, response_obj, start_time, end_time):
print(
f"Post-API Call - response object: {response_obj}; model: {kwargs['model']}"
)
def log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def async_log_stream_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Stream")
def log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Success")
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
print(f"On Failure")
"""
Test sync + async
- Authorization Errors
- Random API Error
"""
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.parametrize("error_type", ["Authorization Error", "API Error"])
@pytest.mark.asyncio
async def test_router_retries_errors(sync_mode, error_type):
"""
- Auth Error -> 0 retries
- API Error -> 2 retries
"""
_api_key = (
"bad-key" if error_type == "Authorization Error" else os.getenv("AZURE_API_KEY")
)
print(f"_api_key: {_api_key}")
model_list = [
{
"model_name": "azure/gpt-3.5-turbo", # openai model name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-functioncalling",
"api_key": _api_key,
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"tpm": 240000,
"rpm": 1800,
},
]
router = Router(model_list=model_list, allowed_fails=3)
customHandler = MyCustomHandler()
litellm.callbacks = [customHandler]
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
kwargs = {
"model": "azure/gpt-3.5-turbo",
"messages": messages,
"mock_response": (
None
if error_type == "Authorization Error"
else Exception("Invalid Request")
),
}
try:
if sync_mode:
response = router.completion(**kwargs)
else:
response = await router.acompletion(**kwargs)
except Exception as e:
pass
await asyncio.sleep(
0.05
) # allow a delay as success_callbacks are on a separate thread
print(f"customHandler.previous_models: {customHandler.previous_models}")
if error_type == "Authorization Error":
assert customHandler.previous_models == 0 # 0 retries
else:
assert customHandler.previous_models == 2 # 2 retries


@ -57,6 +57,7 @@ def test_router_timeouts():
redis_password=os.getenv("REDIS_PASSWORD"), redis_password=os.getenv("REDIS_PASSWORD"),
redis_port=int(os.getenv("REDIS_PORT")), redis_port=int(os.getenv("REDIS_PORT")),
timeout=10, timeout=10,
num_retries=0,
) )
print("***** TPM SETTINGS *****") print("***** TPM SETTINGS *****")
@ -89,15 +90,15 @@ def test_router_timeouts():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_router_timeouts_bedrock(): async def test_router_timeouts_bedrock():
import openai import openai, uuid
# Model list for OpenAI and Anthropic models # Model list for OpenAI and Anthropic models
model_list = [ _model_list = [
{ {
"model_name": "bedrock", "model_name": "bedrock",
"litellm_params": { "litellm_params": {
"model": "bedrock/anthropic.claude-instant-v1", "model": "bedrock/anthropic.claude-instant-v1",
"timeout": 0.001, "timeout": 0.00001,
}, },
"tpm": 80000, "tpm": 80000,
}, },
@ -105,17 +106,18 @@ async def test_router_timeouts_bedrock():
# Configure router # Configure router
router = Router( router = Router(
model_list=model_list, model_list=_model_list,
routing_strategy="usage-based-routing", routing_strategy="usage-based-routing",
debug_level="DEBUG", debug_level="DEBUG",
set_verbose=True, set_verbose=True,
num_retries=0,
) )
litellm.set_verbose = True litellm.set_verbose = True
try: try:
response = await router.acompletion( response = await router.acompletion(
model="bedrock", model="bedrock",
messages=[{"role": "user", "content": "hello, who are u"}], messages=[{"role": "user", "content": f"hello, who are u {uuid.uuid4()}"}],
) )
print(response) print(response)
pytest.fail("Did not raise error `openai.APITimeoutError`") pytest.fail("Did not raise error `openai.APITimeoutError`")


@ -518,7 +518,7 @@ async def test_acompletion_gemini_stream():
litellm.set_verbose = True litellm.set_verbose = True
print("Streaming gemini response") print("Streaming gemini response")
messages = [ messages = [
{"role": "system", "content": "You are a helpful assistant."}, # {"role": "system", "content": "You are a helpful assistant."},
{ {
"role": "user", "role": "user",
"content": "What do you know?", "content": "What do you know?",
@ -1271,6 +1271,33 @@ def test_completion_sagemaker_stream():
pytest.fail(f"Error occurred: {e}") pytest.fail(f"Error occurred: {e}")
def test_completion_watsonx_stream():
litellm.set_verbose = True
try:
response = completion(
model="watsonx/ibm/granite-13b-chat-v2",
messages=messages,
temperature=0.5,
max_tokens=20,
stream=True,
)
complete_response = ""
has_finish_reason = False
# Add any assertions here to check the response
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
has_finish_reason = finished
if finished:
break
complete_response += chunk
if has_finish_reason is False:
raise Exception("finish reason not set for last chunk")
if complete_response.strip() == "":
raise Exception("Empty response received")
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_completion_sagemaker_stream() # test_completion_sagemaker_stream()
@ -2446,6 +2473,34 @@ class ModelResponseIterator:
return self.model_response return self.model_response
class ModelResponseListIterator:
def __init__(self, model_responses):
self.model_responses = model_responses
self.index = 0
# Sync iterator
def __iter__(self):
return self
def __next__(self):
if self.index >= len(self.model_responses):
raise StopIteration
model_response = self.model_responses[self.index]
self.index += 1
return model_response
# Async iterator
def __aiter__(self):
return self
async def __anext__(self):
if self.index >= len(self.model_responses):
raise StopAsyncIteration
model_response = self.model_responses[self.index]
self.index += 1
return model_response
def test_unit_test_custom_stream_wrapper(): def test_unit_test_custom_stream_wrapper():
""" """
Test if last streaming chunk ends with '?', if the message repeats itself. Test if last streaming chunk ends with '?', if the message repeats itself.
@ -2486,3 +2541,268 @@ def test_unit_test_custom_stream_wrapper():
if "How are you?" in chunk.choices[0].delta.content: if "How are you?" in chunk.choices[0].delta.content:
freq += 1 freq += 1
assert freq == 1 assert freq == 1
def test_aamazing_unit_test_custom_stream_wrapper_n():
"""
Test if the translated output maps exactly to the received openai input
Relevant issue: https://github.com/BerriAI/litellm/issues/3276
"""
chunks = [
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "It"},
"logprobs": {
"content": [
{
"token": "It",
"logprob": -1.5952516,
"bytes": [73, 116],
"top_logprobs": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 1,
"delta": {"content": "Brown"},
"logprobs": {
"content": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
"top_logprobs": [
{
"token": "Brown",
"logprob": -0.7358765,
"bytes": [66, 114, 111, 119, 110],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "'s"},
"logprobs": {
"content": [
{
"token": "'s",
"logprob": -0.006786893,
"bytes": [39, 115],
"top_logprobs": [
{
"token": "'s",
"logprob": -0.006786893,
"bytes": [39, 115],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": " impossible"},
"logprobs": {
"content": [
{
"token": " impossible",
"logprob": -0.06528423,
"bytes": [
32,
105,
109,
112,
111,
115,
115,
105,
98,
108,
101,
],
"top_logprobs": [
{
"token": " impossible",
"logprob": -0.06528423,
"bytes": [
32,
105,
109,
112,
111,
115,
115,
105,
98,
108,
101,
],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{
"index": 0,
"delta": {"content": "—even"},
"logprobs": {
"content": [
{
"token": "—even",
"logprob": -9999.0,
"bytes": [226, 128, 148, 101, 118, 101, 110],
"top_logprobs": [
{
"token": " to",
"logprob": -0.12302828,
"bytes": [32, 116, 111],
}
],
}
]
},
"finish_reason": None,
}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{"index": 0, "delta": {}, "logprobs": None, "finish_reason": "length"}
],
},
{
"id": "chatcmpl-9HzZIMCtVq7CbTmdwEZrktiTeoiYe",
"object": "chat.completion.chunk",
"created": 1714075272,
"model": "gpt-4-0613",
"system_fingerprint": None,
"choices": [
{"index": 1, "delta": {}, "logprobs": None, "finish_reason": "stop"}
],
},
]
litellm.set_verbose = True
chunk_list = []
for chunk in chunks:
new_chunk = litellm.ModelResponse(stream=True, id=chunk["id"])
if "choices" in chunk and isinstance(chunk["choices"], list):
print("INSIDE CHUNK CHOICES!")
new_choices = []
for choice in chunk["choices"]:
if isinstance(choice, litellm.utils.StreamingChoices):
_new_choice = choice
elif isinstance(choice, dict):
_new_choice = litellm.utils.StreamingChoices(**choice)
new_choices.append(_new_choice)
new_chunk.choices = new_choices
chunk_list.append(new_chunk)
completion_stream = ModelResponseListIterator(model_responses=chunk_list)
response = litellm.CustomStreamWrapper(
completion_stream=completion_stream,
model="gpt-4-0613",
custom_llm_provider="cached_response",
logging_obj=litellm.Logging(
model="gpt-4-0613",
messages=[{"role": "user", "content": "Hey"}],
stream=True,
call_type="completion",
start_time=time.time(),
litellm_call_id="12345",
function_id="1245",
),
)
for idx, chunk in enumerate(response):
chunk_dict = {}
try:
chunk_dict = chunk.model_dump(exclude_none=True)
except:
chunk_dict = chunk.dict(exclude_none=True)
chunk_dict.pop("created")
chunks[idx].pop("created")
if chunks[idx]["system_fingerprint"] is None:
chunks[idx].pop("system_fingerprint", None)
if idx == 0:
for choice in chunk_dict["choices"]:
if "role" in choice["delta"]:
choice["delta"].pop("role")
for choice in chunks[idx]["choices"]:
# ignore finish reason None - since our pydantic object is set to exclude_none = true
if "finish_reason" in choice and choice["finish_reason"] is None:
choice.pop("finish_reason")
if "logprobs" in choice and choice["logprobs"] is None:
choice.pop("logprobs")
assert (
chunk_dict == chunks[idx]
), f"idx={idx} translated chunk = {chunk_dict} != openai chunk = {chunks[idx]}"


@ -78,7 +78,8 @@ def test_hanging_request_azure():
"model_name": "openai-gpt", "model_name": "openai-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"}, "litellm_params": {"model": "gpt-3.5-turbo"},
}, },
] ],
num_retries=0,
) )
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0] encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -131,7 +132,8 @@ def test_hanging_request_openai():
"model_name": "openai-gpt", "model_name": "openai-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"}, "litellm_params": {"model": "gpt-3.5-turbo"},
}, },
] ],
num_retries=0,
) )
encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0] encoded = litellm.utils.encode(model="gpt-3.5-turbo", text="blue")[0]
@ -189,6 +191,7 @@ def test_timeout_streaming():
# test_timeout_streaming() # test_timeout_streaming()
@pytest.mark.skip(reason="local test")
def test_timeout_ollama(): def test_timeout_ollama():
# this Will Raise a timeout # this Will Raise a timeout
import litellm import litellm


@ -282,6 +282,64 @@ def test_router_skip_rate_limited_deployments():
print(f"An exception occurred! {str(e)}") print(f"An exception occurred! {str(e)}")
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_multiple_potential_deployments(sync_mode):
"""
If multiple deployments have the same tpm value
call 5 times, test if deployments are shuffled.
-> prevents single deployment from being overloaded in high-concurrency scenario
"""
model_list = [
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"tpm": 1440,
},
},
{
"model_name": "azure-model",
"litellm_params": {
"model": "azure/gpt-turbo-2",
"api_key": "os.environ/AZURE_FRANCE_API_KEY",
"api_base": "https://openai-france-1234.openai.azure.com",
"tpm": 1440,
},
},
]
router = Router(
model_list=model_list,
routing_strategy="usage-based-routing-v2",
set_verbose=False,
num_retries=3,
) # type: ignore
model_ids = set()
for _ in range(1000):
if sync_mode:
deployment = router.get_available_deployment(
model="azure-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
else:
deployment = await router.async_get_available_deployment(
model="azure-model",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
## get id ##
id = deployment.get("model_info", {}).get("id")
model_ids.add(id)
assert len(model_ids) == 2
def test_single_deployment_tpm_zero(): def test_single_deployment_tpm_zero():
import litellm import litellm
import os import os


@ -1,5 +1,5 @@
from typing import List, Optional, Union, Dict, Tuple, Literal from typing import List, Optional, Union, Dict, Tuple, Literal
import httpx
from pydantic import BaseModel, validator from pydantic import BaseModel, validator
from .completion import CompletionRequest from .completion import CompletionRequest
from .embedding import EmbeddingRequest from .embedding import EmbeddingRequest
@ -104,11 +104,13 @@ class LiteLLM_Params(BaseModel):
api_key: Optional[str] = None api_key: Optional[str] = None
api_base: Optional[str] = None api_base: Optional[str] = None
api_version: Optional[str] = None api_version: Optional[str] = None
timeout: Optional[Union[float, str]] = None # if str, pass in as os.environ/ timeout: Optional[Union[float, str, httpx.Timeout]] = (
None # if str, pass in as os.environ/
)
stream_timeout: Optional[Union[float, str]] = ( stream_timeout: Optional[Union[float, str]] = (
None # timeout when making stream=True calls, if str, pass in as os.environ/ None # timeout when making stream=True calls, if str, pass in as os.environ/
) )
max_retries: int = 2 # follows openai default of 2 max_retries: Optional[int] = None
organization: Optional[str] = None # for openai orgs organization: Optional[str] = None # for openai orgs
## VERTEX AI ## ## VERTEX AI ##
vertex_project: Optional[str] = None vertex_project: Optional[str] = None
@ -146,14 +148,13 @@ class LiteLLM_Params(BaseModel):
args.pop("self", None) args.pop("self", None)
args.pop("params", None) args.pop("params", None)
args.pop("__class__", None) args.pop("__class__", None)
if max_retries is None: if max_retries is not None and isinstance(max_retries, str):
max_retries = 2
elif isinstance(max_retries, str):
max_retries = int(max_retries) # cast to int max_retries = int(max_retries) # cast to int
super().__init__(max_retries=max_retries, **args, **params) super().__init__(max_retries=max_retries, **args, **params)
class Config: class Config:
extra = "allow" extra = "allow"
arbitrary_types_allowed = True
def __contains__(self, key): def __contains__(self, key):
# Define custom behavior for the 'in' operator # Define custom behavior for the 'in' operator
@ -201,6 +202,9 @@ class updateDeployment(BaseModel):
litellm_params: Optional[updateLiteLLMParams] = None litellm_params: Optional[updateLiteLLMParams] = None
model_info: Optional[ModelInfo] = None model_info: Optional[ModelInfo] = None
class Config:
protected_namespaces = ()
class Deployment(BaseModel): class Deployment(BaseModel):
model_name: str model_name: str
@ -259,3 +263,4 @@ class RouterErrors(enum.Enum):
""" """
user_defined_ratelimit_error = "Deployment over user-defined ratelimit." user_defined_ratelimit_error = "Deployment over user-defined ratelimit."
no_deployments_available = "No deployments available for selected model"


@ -19,6 +19,7 @@ from functools import wraps
import datetime, time import datetime, time
import tiktoken import tiktoken
import uuid import uuid
from pydantic import BaseModel
import aiohttp import aiohttp
import textwrap import textwrap
import logging import logging
@ -69,6 +70,7 @@ from .integrations.langsmith import LangsmithLogger
from .integrations.weights_biases import WeightsBiasesLogger from .integrations.weights_biases import WeightsBiasesLogger
from .integrations.custom_logger import CustomLogger from .integrations.custom_logger import CustomLogger
from .integrations.langfuse import LangFuseLogger from .integrations.langfuse import LangFuseLogger
from .integrations.openmeter import OpenMeterLogger
from .integrations.datadog import DataDogLogger from .integrations.datadog import DataDogLogger
from .integrations.prometheus import PrometheusLogger from .integrations.prometheus import PrometheusLogger
from .integrations.prometheus_services import PrometheusServicesLogger from .integrations.prometheus_services import PrometheusServicesLogger
@ -105,7 +107,7 @@ try:
except Exception as e: except Exception as e:
verbose_logger.debug(f"Exception import enterprise features {str(e)}") verbose_logger.debug(f"Exception import enterprise features {str(e)}")
from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO from typing import cast, List, Dict, Union, Optional, Literal, Any, BinaryIO, Iterable
from .caching import Cache from .caching import Cache
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
@ -129,6 +131,7 @@ langsmithLogger = None
weightsBiasesLogger = None weightsBiasesLogger = None
customLogger = None customLogger = None
langFuseLogger = None langFuseLogger = None
openMeterLogger = None
dataDogLogger = None dataDogLogger = None
prometheusLogger = None prometheusLogger = None
dynamoLogger = None dynamoLogger = None
@ -219,6 +222,61 @@ def map_finish_reason(
return finish_reason return finish_reason
class TopLogprob(OpenAIObject):
token: str
"""The token."""
bytes: Optional[List[int]] = None
"""A list of integers representing the UTF-8 bytes representation of the token.
Useful in instances where characters are represented by multiple tokens and
their byte representations must be combined to generate the correct text
representation. Can be `null` if there is no bytes representation for the token.
"""
logprob: float
"""The log probability of this token, if it is within the top 20 most likely
tokens.
Otherwise, the value `-9999.0` is used to signify that the token is very
unlikely.
"""
class ChatCompletionTokenLogprob(OpenAIObject):
token: str
"""The token."""
bytes: Optional[List[int]] = None
"""A list of integers representing the UTF-8 bytes representation of the token.
Useful in instances where characters are represented by multiple tokens and
their byte representations must be combined to generate the correct text
representation. Can be `null` if there is no bytes representation for the token.
"""
logprob: float
"""The log probability of this token, if it is within the top 20 most likely
tokens.
Otherwise, the value `-9999.0` is used to signify that the token is very
unlikely.
"""
top_logprobs: List[TopLogprob]
"""List of the most likely tokens and their log probability, at this token
position.
In rare cases, there may be fewer than the number of requested `top_logprobs`
returned.
"""
class ChoiceLogprobs(OpenAIObject):
content: Optional[List[ChatCompletionTokenLogprob]] = None
"""A list of message content tokens with log probability information."""
class FunctionCall(OpenAIObject): class FunctionCall(OpenAIObject):
arguments: str arguments: str
name: Optional[str] = None name: Optional[str] = None
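# A small sketch of how a raw OpenAI-style logprobs dict (same shape as the chunks in the
# streaming tests above) hydrates into the TopLogprob / ChatCompletionTokenLogprob /
# ChoiceLogprobs classes added above; token values are illustrative.
_raw_logprobs = {
    "content": [
        {
            "token": "It",
            "logprob": -1.5952516,
            "bytes": [73, 116],
            "top_logprobs": [
                {"token": "Brown", "logprob": -0.7358765, "bytes": [66, 114, 111, 119, 110]}
            ],
        }
    ]
}
# Nested dicts are coerced into the typed objects by pydantic, exactly as Message and
# StreamingChoices do below via ChoiceLogprobs(**logprobs).
choice_logprobs = ChoiceLogprobs(**_raw_logprobs)
assert choice_logprobs.content[0].top_logprobs[0].token == "Brown"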
@ -320,19 +378,19 @@ class Message(OpenAIObject):
super(Message, self).__init__(**params) super(Message, self).__init__(**params)
self.content = content self.content = content
self.role = role self.role = role
self.tool_calls = None
self.function_call = None
if function_call is not None: if function_call is not None:
self.function_call = FunctionCall(**function_call) self.function_call = FunctionCall(**function_call)
if tool_calls is not None: if tool_calls is not None:
self.tool_calls = [] self.tool_calls = [
for tool_call in tool_calls: ChatCompletionMessageToolCall(**tool_call) for tool_call in tool_calls
if isinstance(tool_call, dict): ]
self.tool_calls.append(ChatCompletionMessageToolCall(**tool_call))
else:
self.tool_calls.append(tool_call)
if logprobs is not None: if logprobs is not None:
self._logprobs = logprobs self._logprobs = ChoiceLogprobs(**logprobs)
def get(self, key, default=None): def get(self, key, default=None):
# Custom .get() method to access attributes with a default value if the attribute doesn't exist # Custom .get() method to access attributes with a default value if the attribute doesn't exist
@ -355,12 +413,20 @@ class Message(OpenAIObject):
class Delta(OpenAIObject): class Delta(OpenAIObject):
tool_calls: Optional[List[ChatCompletionDeltaToolCall]] = None
def __init__( def __init__(
self, content=None, role=None, function_call=None, tool_calls=None, **params self,
content=None,
role=None,
function_call=None,
tool_calls=None,
**params,
): ):
super(Delta, self).__init__(**params) super(Delta, self).__init__(**params)
self.content = content self.content = content
self.role = role self.role = role
if function_call is not None and isinstance(function_call, dict): if function_call is not None and isinstance(function_call, dict):
self.function_call = FunctionCall(**function_call) self.function_call = FunctionCall(**function_call)
else: else:
@ -410,7 +476,7 @@ class Choices(OpenAIObject):
) # set finish_reason for all responses ) # set finish_reason for all responses
self.index = index self.index = index
if message is None: if message is None:
self.message = Message(content=None) self.message = Message()
else: else:
if isinstance(message, Message): if isinstance(message, Message):
self.message = message self.message = message
@ -492,7 +558,11 @@ class StreamingChoices(OpenAIObject):
self.delta = Delta() self.delta = Delta()
if enhancements is not None: if enhancements is not None:
self.enhancements = enhancements self.enhancements = enhancements
self.logprobs = logprobs
if logprobs is not None and isinstance(logprobs, dict):
self.logprobs = ChoiceLogprobs(**logprobs)
else:
self.logprobs = logprobs # type: ignore
def __contains__(self, key): def __contains__(self, key):
# Define custom behavior for the 'in' operator # Define custom behavior for the 'in' operator
@ -1139,7 +1209,14 @@ class Logging:
if verbose_logger.level == 0: if verbose_logger.level == 0:
# this means verbose logger was not switched on - user is in litellm.set_verbose=True # this means verbose logger was not switched on - user is in litellm.set_verbose=True
print_verbose(f"\033[92m{curl_command}\033[0m\n") print_verbose(f"\033[92m{curl_command}\033[0m\n")
verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
if litellm.json_logs:
verbose_logger.info(
"POST Request Sent from LiteLLM",
extra={"api_base": {api_base}, **masked_headers},
)
else:
verbose_logger.info(f"\033[92m{curl_command}\033[0m\n")
if self.logger_fn and callable(self.logger_fn): if self.logger_fn and callable(self.logger_fn):
try: try:
self.logger_fn( self.logger_fn(
@ -1149,7 +1226,6 @@ class Logging:
print_verbose( print_verbose(
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
) )
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
callbacks = litellm.input_callback + self.dynamic_input_callbacks callbacks = litellm.input_callback + self.dynamic_input_callbacks
for callback in callbacks: for callback in callbacks:
@ -1166,29 +1242,20 @@ class Logging:
litellm_call_id=self.litellm_params["litellm_call_id"], litellm_call_id=self.litellm_params["litellm_call_id"],
print_verbose=print_verbose, print_verbose=print_verbose,
) )
elif callback == "lite_debugger":
print_verbose(
f"reaches litedebugger for logging! - model_call_details {self.model_call_details}"
)
model = self.model_call_details["model"]
messages = self.model_call_details["input"]
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
liteDebuggerClient.input_log_event(
model=model,
messages=messages,
end_user=self.model_call_details.get("user", "default"),
litellm_call_id=self.litellm_params["litellm_call_id"],
litellm_params=self.model_call_details["litellm_params"],
optional_params=self.model_call_details["optional_params"],
print_verbose=print_verbose,
call_type=self.call_type,
)
elif callback == "sentry" and add_breadcrumb: elif callback == "sentry" and add_breadcrumb:
print_verbose("reaches sentry breadcrumbing") try:
details_to_log = copy.deepcopy(self.model_call_details)
except:
details_to_log = self.model_call_details
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb( add_breadcrumb(
category="litellm.llm_call", category="litellm.llm_call",
message=f"Model Call Details pre-call: {self.model_call_details}", message=f"Model Call Details pre-call: {details_to_log}",
level="info", level="info",
) )
elif isinstance(callback, CustomLogger): # custom logger class elif isinstance(callback, CustomLogger): # custom logger class
@ -1252,7 +1319,7 @@ class Logging:
print_verbose( print_verbose(
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}" f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while logging {traceback.format_exc()}"
) )
self.redact_message_input_output_from_logging(result=original_response)
# Input Integration Logging -> If you want to log the fact that an attempt to call the model was made # Input Integration Logging -> If you want to log the fact that an attempt to call the model was made
callbacks = litellm.input_callback + self.dynamic_input_callbacks callbacks = litellm.input_callback + self.dynamic_input_callbacks
@ -1270,9 +1337,19 @@ class Logging:
) )
elif callback == "sentry" and add_breadcrumb: elif callback == "sentry" and add_breadcrumb:
print_verbose("reaches sentry breadcrumbing") print_verbose("reaches sentry breadcrumbing")
try:
details_to_log = copy.deepcopy(self.model_call_details)
except:
details_to_log = self.model_call_details
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb( add_breadcrumb(
category="litellm.llm_call", category="litellm.llm_call",
message=f"Model Call Details post-call: {self.model_call_details}", message=f"Model Call Details post-call: {details_to_log}",
level="info", level="info",
) )
elif isinstance(callback, CustomLogger): # custom logger class elif isinstance(callback, CustomLogger): # custom logger class
@ -1464,6 +1541,8 @@ class Logging:
else: else:
callbacks = litellm.success_callback callbacks = litellm.success_callback
self.redact_message_input_output_from_logging(result=result)
for callback in callbacks: for callback in callbacks:
try: try:
litellm_params = self.model_call_details.get("litellm_params", {}) litellm_params = self.model_call_details.get("litellm_params", {})
@ -1850,6 +1929,51 @@ class Logging:
end_time=end_time, end_time=end_time,
print_verbose=print_verbose, print_verbose=print_verbose,
) )
if (
callback == "openmeter"
and self.model_call_details.get("litellm_params", {}).get(
"acompletion", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aembedding", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"aimage_generation", False
)
== False
and self.model_call_details.get("litellm_params", {}).get(
"atranscription", False
)
== False
):
global openMeterLogger
if openMeterLogger is None:
print_verbose("Instantiates openmeter client")
openMeterLogger = OpenMeterLogger()
if self.stream and complete_streaming_response is None:
openMeterLogger.log_stream_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
else:
if self.stream and complete_streaming_response:
self.model_call_details["complete_response"] = (
self.model_call_details.get(
"complete_streaming_response", {}
)
)
result = self.model_call_details["complete_response"]
openMeterLogger.log_success_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
if ( if (
isinstance(callback, CustomLogger) isinstance(callback, CustomLogger)
and self.model_call_details.get("litellm_params", {}).get( and self.model_call_details.get("litellm_params", {}).get(
@ -2008,7 +2132,9 @@ class Logging:
callbacks.append(callback) callbacks.append(callback)
else: else:
callbacks = litellm._async_success_callback callbacks = litellm._async_success_callback
print_verbose(f"Async success callbacks: {callbacks}")
self.redact_message_input_output_from_logging(result=result)
for callback in callbacks: for callback in callbacks:
# check if callback can run for this request # check if callback can run for this request
litellm_params = self.model_call_details.get("litellm_params", {}) litellm_params = self.model_call_details.get("litellm_params", {})
@ -2046,6 +2172,35 @@ class Logging:
await litellm.cache.async_add_cache(result, **kwargs) await litellm.cache.async_add_cache(result, **kwargs)
else: else:
litellm.cache.add_cache(result, **kwargs) litellm.cache.add_cache(result, **kwargs)
if callback == "openmeter":
global openMeterLogger
if self.stream == True:
if (
"async_complete_streaming_response"
in self.model_call_details
):
await openMeterLogger.async_log_success_event(
kwargs=self.model_call_details,
response_obj=self.model_call_details[
"async_complete_streaming_response"
],
start_time=start_time,
end_time=end_time,
)
else:
await openMeterLogger.async_log_stream_event( # [TODO]: move this to being an async log stream event function
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
else:
await openMeterLogger.async_log_success_event(
kwargs=self.model_call_details,
response_obj=result,
start_time=start_time,
end_time=end_time,
)
if isinstance(callback, CustomLogger): # custom logger class if isinstance(callback, CustomLogger): # custom logger class
if self.stream == True: if self.stream == True:
if ( if (
@ -2169,7 +2324,10 @@ class Logging:
start_time=start_time, start_time=start_time,
end_time=end_time, end_time=end_time,
) )
result = None # result sent to all loggers, init this to None incase it's not created result = None # result sent to all loggers, init this to None incase it's not created
self.redact_message_input_output_from_logging(result=result)
for callback in litellm.failure_callback: for callback in litellm.failure_callback:
try: try:
if callback == "lite_debugger": if callback == "lite_debugger":
@ -2354,6 +2512,39 @@ class Logging:
f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}" f"LiteLLM.LoggingError: [Non-Blocking] Exception occurred while success logging {traceback.format_exc()}"
) )
def redact_message_input_output_from_logging(self, result):
"""
Removes messages, prompts, input, response from logging. This modifies the data in-place
only redacts when litellm.turn_off_message_logging == True
"""
# check if user opted out of logging message/response to callbacks
if litellm.turn_off_message_logging == True:
# remove messages, prompts, input, response from logging
self.model_call_details["messages"] = "redacted-by-litellm"
self.model_call_details["prompt"] = ""
self.model_call_details["input"] = ""
# response cleaning
# ChatCompletion Responses
if self.stream and "complete_streaming_response" in self.model_call_details:
_streaming_response = self.model_call_details[
"complete_streaming_response"
]
for choice in _streaming_response.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
else:
if result is not None:
if isinstance(result, litellm.ModelResponse):
if hasattr(result, "choices") and result.choices is not None:
for choice in result.choices:
if isinstance(choice, litellm.Choices):
choice.message.content = "redacted-by-litellm"
elif isinstance(choice, litellm.utils.StreamingChoices):
choice.delta.content = "redacted-by-litellm"
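# Usage sketch for the redaction flag read above (the callback choice is illustrative):
import litellm

litellm.turn_off_message_logging = True   # loggers/callbacks receive redacted content
litellm.success_callback = ["langfuse"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "some sensitive text"}],
)
# The caller still gets the real response; only the copy sent to logging integrations
# has messages/prompt/input and choice contents replaced with "redacted-by-litellm".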
def exception_logging( def exception_logging(
additional_args={}, additional_args={},
@ -2436,7 +2627,7 @@ class Rules:
####### CLIENT ################### ####### CLIENT ###################
# make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking # make it easy to log if completion/embedding runs succeeded or failed + see what happened | Non-Blocking
def function_setup( def function_setup(
original_function, rules_obj, start_time, *args, **kwargs original_function: str, rules_obj, start_time, *args, **kwargs
): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc. ): # just run once to check if user wants to send their data anywhere - PostHog/Sentry/Slack/etc.
try: try:
global callback_list, add_breadcrumb, user_logger_fn, Logging global callback_list, add_breadcrumb, user_logger_fn, Logging
@ -2460,10 +2651,12 @@ def function_setup(
len(litellm.input_callback) > 0 len(litellm.input_callback) > 0
or len(litellm.success_callback) > 0 or len(litellm.success_callback) > 0
or len(litellm.failure_callback) > 0 or len(litellm.failure_callback) > 0
) and len(callback_list) == 0: ) and len(
callback_list # type: ignore
) == 0: # type: ignore
callback_list = list( callback_list = list(
set( set(
litellm.input_callback litellm.input_callback # type: ignore
+ litellm.success_callback + litellm.success_callback
+ litellm.failure_callback + litellm.failure_callback
) )
@ -2472,7 +2665,7 @@ def function_setup(
## ASYNC CALLBACKS ## ASYNC CALLBACKS
if len(litellm.input_callback) > 0: if len(litellm.input_callback) > 0:
removed_async_items = [] removed_async_items = []
for index, callback in enumerate(litellm.input_callback): for index, callback in enumerate(litellm.input_callback): # type: ignore
if inspect.iscoroutinefunction(callback): if inspect.iscoroutinefunction(callback):
litellm._async_input_callback.append(callback) litellm._async_input_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
@ -2483,11 +2676,11 @@ def function_setup(
if len(litellm.success_callback) > 0: if len(litellm.success_callback) > 0:
removed_async_items = [] removed_async_items = []
for index, callback in enumerate(litellm.success_callback): for index, callback in enumerate(litellm.success_callback): # type: ignore
if inspect.iscoroutinefunction(callback): if inspect.iscoroutinefunction(callback):
litellm._async_success_callback.append(callback) litellm._async_success_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
elif callback == "dynamodb": elif callback == "dynamodb" or callback == "openmeter":
# dynamo is an async callback, it's used for the proxy and needs to be async # dynamo is an async callback, it's used for the proxy and needs to be async
# we only support async dynamo db logging for acompletion/aembedding since that's used on proxy # we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback) litellm._async_success_callback.append(callback)
@ -2499,7 +2692,7 @@ def function_setup(
if len(litellm.failure_callback) > 0: if len(litellm.failure_callback) > 0:
removed_async_items = [] removed_async_items = []
for index, callback in enumerate(litellm.failure_callback): for index, callback in enumerate(litellm.failure_callback): # type: ignore
if inspect.iscoroutinefunction(callback): if inspect.iscoroutinefunction(callback):
litellm._async_failure_callback.append(callback) litellm._async_failure_callback.append(callback)
removed_async_items.append(index) removed_async_items.append(index)
@ -2533,16 +2726,26 @@ def function_setup(
dynamic_success_callbacks = kwargs.pop("success_callback") dynamic_success_callbacks = kwargs.pop("success_callback")
if add_breadcrumb: if add_breadcrumb:
try:
details_to_log = copy.deepcopy(kwargs)
except:
details_to_log = kwargs
if litellm.turn_off_message_logging:
# make a copy of the _model_Call_details and log it
details_to_log.pop("messages", None)
details_to_log.pop("input", None)
details_to_log.pop("prompt", None)
add_breadcrumb( add_breadcrumb(
category="litellm.llm_call", category="litellm.llm_call",
message=f"Positional Args: {args}, Keyword Args: {kwargs}", message=f"Positional Args: {args}, Keyword Args: {details_to_log}",
level="info", level="info",
) )
if "logger_fn" in kwargs: if "logger_fn" in kwargs:
user_logger_fn = kwargs["logger_fn"] user_logger_fn = kwargs["logger_fn"]
# INIT LOGGER - for user-specified integrations # INIT LOGGER - for user-specified integrations
model = args[0] if len(args) > 0 else kwargs.get("model", None) model = args[0] if len(args) > 0 else kwargs.get("model", None)
call_type = original_function.__name__ call_type = original_function
if ( if (
call_type == CallTypes.completion.value call_type == CallTypes.completion.value
or call_type == CallTypes.acompletion.value or call_type == CallTypes.acompletion.value
@ -2724,7 +2927,7 @@ def client(original_function):
try: try:
if logging_obj is None: if logging_obj is None:
logging_obj, kwargs = function_setup( logging_obj, kwargs = function_setup(
original_function, rules_obj, start_time, *args, **kwargs original_function.__name__, rules_obj, start_time, *args, **kwargs
) )
kwargs["litellm_logging_obj"] = logging_obj kwargs["litellm_logging_obj"] = logging_obj
@ -3033,7 +3236,7 @@ def client(original_function):
try: try:
if logging_obj is None: if logging_obj is None:
logging_obj, kwargs = function_setup( logging_obj, kwargs = function_setup(
original_function, rules_obj, start_time, *args, **kwargs original_function.__name__, rules_obj, start_time, *args, **kwargs
) )
kwargs["litellm_logging_obj"] = logging_obj kwargs["litellm_logging_obj"] = logging_obj
@ -3540,12 +3743,12 @@ def get_replicate_completion_pricing(completion_response=None, total_time=0.0):
a100_80gb_price_per_second_public = ( a100_80gb_price_per_second_public = (
0.001400 # assume all calls sent to A100 80GB for now 0.001400 # assume all calls sent to A100 80GB for now
) )
if total_time == 0.0: if total_time == 0.0: # total time is in ms
start_time = completion_response["created"] start_time = completion_response["created"]
end_time = completion_response["ended"] end_time = completion_response["ended"]
total_time = end_time - start_time total_time = end_time - start_time
return a100_80gb_price_per_second_public * total_time return a100_80gb_price_per_second_public * total_time / 1000
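# Worked example of the corrected math, assuming the A100 80GB rate above and a
# Replicate prediction that ran for 5000 ms:
#   total_time = 5000 ms -> 5000 / 1000 = 5 s of compute
#   cost = 0.001400 $/s * 5 s = $0.007
# Without the division by 1000, the same call would have been priced as $7.00.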
def _select_tokenizer(model: str): def _select_tokenizer(model: str):
@ -3567,7 +3770,7 @@ def _select_tokenizer(model: str):
tokenizer = Tokenizer.from_str(json_str) tokenizer = Tokenizer.from_str(json_str)
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# llama2 # llama2
elif "llama-2" in model.lower(): elif "llama-2" in model.lower() or "replicate" in model.lower():
tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") tokenizer = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
return {"type": "huggingface_tokenizer", "tokenizer": tokenizer} return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
# default - tiktoken # default - tiktoken
@ -4168,7 +4371,10 @@ def completion_cost(
model = get_model_params_and_category(model) model = get_model_params_and_category(model)
# replicate llms are calculate based on time for request running # replicate llms are calculate based on time for request running
# see https://replicate.com/pricing # see https://replicate.com/pricing
elif model in litellm.replicate_models or "replicate" in model: elif (
model in litellm.replicate_models or "replicate" in model
) and model not in litellm.model_cost:
# for unmapped replicate model, default to replicate's time tracking logic
return get_replicate_completion_pricing(completion_response, total_time) return get_replicate_completion_pricing(completion_response, total_time)
( (
@ -4554,7 +4760,36 @@ def get_optional_params(
k.startswith("vertex_") and custom_llm_provider != "vertex_ai" k.startswith("vertex_") and custom_llm_provider != "vertex_ai"
): # allow dynamically setting vertex ai init logic ): # allow dynamically setting vertex ai init logic
continue continue
passed_params[k] = v passed_params[k] = v
optional_params = {}
common_auth_dict = litellm.common_cloud_provider_auth_params
if custom_llm_provider in common_auth_dict["providers"]:
"""
Check if params = ["project", "region_name", "token"]
and correctly translate for = ["azure", "vertex_ai", "watsonx", "aws"]
"""
if custom_llm_provider == "azure":
optional_params = litellm.AzureOpenAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
elif custom_llm_provider == "bedrock":
optional_params = (
litellm.AmazonBedrockGlobalConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
)
elif custom_llm_provider == "vertex_ai":
optional_params = litellm.VertexAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
elif custom_llm_provider == "watsonx":
optional_params = litellm.IBMWatsonXAIConfig().map_special_auth_params(
non_default_params=passed_params, optional_params=optional_params
)
default_params = { default_params = {
"functions": None, "functions": None,
"function_call": None, "function_call": None,
@ -4590,7 +4825,7 @@ def get_optional_params(
and v != default_params[k] and v != default_params[k]
) )
} }
optional_params = {}
## raise exception if function calling passed in for a provider that doesn't support it ## raise exception if function calling passed in for a provider that doesn't support it
if ( if (
"functions" in non_default_params "functions" in non_default_params
@ -5268,7 +5503,8 @@ def get_optional_params(
optional_params["tools"] = tools optional_params["tools"] = tools
if tool_choice is not None: if tool_choice is not None:
optional_params["tool_choice"] = tool_choice optional_params["tool_choice"] = tool_choice
if response_format is not None:
optional_params["response_format"] = response_format
# check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion # check safe_mode, random_seed: https://docs.mistral.ai/api/#operation/createChatCompletion
safe_mode = passed_params.pop("safe_mode", None) safe_mode = passed_params.pop("safe_mode", None)
random_seed = passed_params.pop("random_seed", None) random_seed = passed_params.pop("random_seed", None)
@ -5280,6 +5516,7 @@ def get_optional_params(
optional_params["extra_body"] = ( optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param extra_body # openai client supports `extra_body` param
) )
elif custom_llm_provider == "groq": elif custom_llm_provider == "groq":
supported_params = get_supported_openai_params( supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider model=model, custom_llm_provider=custom_llm_provider
@ -5360,6 +5597,49 @@ def get_optional_params(
optional_params["extra_body"] = ( optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param extra_body # openai client supports `extra_body` param
) )
elif custom_llm_provider == "watsonx":
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
optional_params["max_new_tokens"] = max_tokens
if stream:
optional_params["stream"] = stream
if temperature is not None:
optional_params["temperature"] = temperature
if top_p is not None:
optional_params["top_p"] = top_p
if frequency_penalty is not None:
optional_params["repetition_penalty"] = frequency_penalty
if seed is not None:
optional_params["random_seed"] = seed
if stop is not None:
optional_params["stop_sequences"] = stop
# WatsonX-only parameters
extra_body = {}
if "decoding_method" in passed_params:
extra_body["decoding_method"] = passed_params.pop("decoding_method")
if "min_tokens" in passed_params or "min_new_tokens" in passed_params:
extra_body["min_new_tokens"] = passed_params.pop(
"min_tokens", passed_params.pop("min_new_tokens")
)
if "top_k" in passed_params:
extra_body["top_k"] = passed_params.pop("top_k")
if "truncate_input_tokens" in passed_params:
extra_body["truncate_input_tokens"] = passed_params.pop(
"truncate_input_tokens"
)
if "length_penalty" in passed_params:
extra_body["length_penalty"] = passed_params.pop("length_penalty")
if "time_limit" in passed_params:
extra_body["time_limit"] = passed_params.pop("time_limit")
if "return_options" in passed_params:
extra_body["return_options"] = passed_params.pop("return_options")
optional_params["extra_body"] = (
extra_body # openai client supports `extra_body` param
)
else: # assume passing in params for openai/azure openai else: # assume passing in params for openai/azure openai
print_verbose( print_verbose(
f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}" f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE - model={model}, custom_llm_provider={custom_llm_provider}"
@ -5762,6 +6042,8 @@ def get_supported_openai_params(model: str, custom_llm_provider: str):
"frequency_penalty", "frequency_penalty",
"presence_penalty", "presence_penalty",
] ]
elif custom_llm_provider == "watsonx":
return litellm.IBMWatsonXAIConfig().get_supported_openai_params()
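# A call sketch showing how the watsonx parameter mapping above plays out; parameter
# values are illustrative and watsonx credentials are assumed to already be configured
# via the provider's environment variables.
from litellm import completion

response = completion(
    model="watsonx/ibm/granite-13b-chat-v2",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=20,               # -> max_new_tokens
    temperature=0.5,
    frequency_penalty=1.1,       # -> repetition_penalty
    stop=["\n\n"],               # -> stop_sequences
    decoding_method="sample",    # watsonx-only param, forwarded via extra_body
    truncate_input_tokens=2048,  # watsonx-only param, forwarded via extra_body
)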
def get_formatted_prompt( def get_formatted_prompt(
@ -5989,6 +6271,8 @@ def get_llm_provider(
model in litellm.bedrock_models or model in litellm.bedrock_embedding_models model in litellm.bedrock_models or model in litellm.bedrock_embedding_models
): ):
custom_llm_provider = "bedrock" custom_llm_provider = "bedrock"
elif model in litellm.watsonx_models:
custom_llm_provider = "watsonx"
# openai embeddings # openai embeddings
elif model in litellm.open_ai_embedding_models: elif model in litellm.open_ai_embedding_models:
custom_llm_provider = "openai" custom_llm_provider = "openai"
@ -6453,7 +6737,7 @@ def validate_environment(model: Optional[str] = None) -> dict:
if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ: if "VERTEXAI_PROJECT" in os.environ and "VERTEXAI_LOCATION" in os.environ:
keys_in_environment = True keys_in_environment = True
else: else:
missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_PROJECT"]) missing_keys.extend(["VERTEXAI_PROJECT", "VERTEXAI_LOCATION"])
elif custom_llm_provider == "huggingface": elif custom_llm_provider == "huggingface":
if "HUGGINGFACE_API_KEY" in os.environ: if "HUGGINGFACE_API_KEY" in os.environ:
keys_in_environment = True keys_in_environment = True
@ -6579,11 +6863,11 @@ def validate_environment(model: Optional[str] = None) -> dict:
def set_callbacks(callback_list, function_id=None): def set_callbacks(callback_list, function_id=None):
global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger global sentry_sdk_instance, capture_exception, add_breadcrumb, posthog, slack_app, alerts_channel, traceloopLogger, athinaLogger, heliconeLogger, aispendLogger, berrispendLogger, supabaseClient, liteDebuggerClient, lunaryLogger, promptLayerLogger, langFuseLogger, customLogger, weightsBiasesLogger, langsmithLogger, dynamoLogger, s3Logger, dataDogLogger, prometheusLogger, greenscaleLogger, openMeterLogger
try: try:
for callback in callback_list: for callback in callback_list:
print_verbose(f"callback: {callback}") print_verbose(f"init callback list: {callback}")
if callback == "sentry": if callback == "sentry":
try: try:
import sentry_sdk import sentry_sdk
@ -6646,6 +6930,8 @@ def set_callbacks(callback_list, function_id=None):
promptLayerLogger = PromptLayerLogger() promptLayerLogger = PromptLayerLogger()
elif callback == "langfuse": elif callback == "langfuse":
langFuseLogger = LangFuseLogger() langFuseLogger = LangFuseLogger()
elif callback == "openmeter":
openMeterLogger = OpenMeterLogger()
elif callback == "datadog": elif callback == "datadog":
dataDogLogger = DataDogLogger() dataDogLogger = DataDogLogger()
elif callback == "prometheus": elif callback == "prometheus":
@ -6982,6 +7268,7 @@ def convert_to_model_response_object(
end_time=None, end_time=None,
hidden_params: Optional[dict] = None, hidden_params: Optional[dict] = None,
): ):
received_args = locals()
try: try:
if response_type == "completion" and ( if response_type == "completion" and (
model_response_object is None model_response_object is None
@ -6993,6 +7280,11 @@ def convert_to_model_response_object(
# for returning cached responses, we need to yield a generator # for returning cached responses, we need to yield a generator
return convert_to_streaming_response(response_object=response_object) return convert_to_streaming_response(response_object=response_object)
choice_list = [] choice_list = []
assert response_object["choices"] is not None and isinstance(
response_object["choices"], Iterable
)
for idx, choice in enumerate(response_object["choices"]): for idx, choice in enumerate(response_object["choices"]):
message = Message( message = Message(
content=choice["message"].get("content", None), content=choice["message"].get("content", None),
@ -7036,9 +7328,10 @@ def convert_to_model_response_object(
model_response_object.model = response_object["model"] model_response_object.model = response_object["model"]
if start_time is not None and end_time is not None: if start_time is not None and end_time is not None:
model_response_object._response_ms = ( # type: ignore if isinstance(start_time, type(end_time)):
end_time - start_time model_response_object._response_ms = ( # type: ignore
).total_seconds() * 1000 end_time - start_time
).total_seconds() * 1000
if hidden_params is not None: if hidden_params is not None:
model_response_object._hidden_params = hidden_params model_response_object._hidden_params = hidden_params
@ -7113,7 +7406,9 @@ def convert_to_model_response_object(
model_response_object._hidden_params = hidden_params model_response_object._hidden_params = hidden_params
return model_response_object return model_response_object
except Exception as e: except Exception as e:
raise Exception(f"Invalid response object {traceback.format_exc()}") raise Exception(
f"Invalid response object {traceback.format_exc()}\n\nreceived_args={received_args}"
)
def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call def acreate(*args, **kwargs): ## Thin client to handle the acreate langchain call
@ -7940,7 +8235,10 @@ def exception_type(
llm_provider="vertex_ai", llm_provider="vertex_ai",
response=original_exception.response, response=original_exception.response,
) )
elif "None Unknown Error." in error_str: elif (
"None Unknown Error." in error_str
or "Content has no parts." in error_str
):
exception_mapping_worked = True exception_mapping_worked = True
raise APIError( raise APIError(
message=f"VertexAIException - {error_str}", message=f"VertexAIException - {error_str}",
@ -9393,9 +9691,14 @@ class CustomStreamWrapper:
is_finished = True is_finished = True
finish_reason = str_line.choices[0].finish_reason finish_reason = str_line.choices[0].finish_reason
if finish_reason == "content_filter": if finish_reason == "content_filter":
error_message = json.dumps( if hasattr(str_line.choices[0], "content_filter_result"):
str_line.choices[0].content_filter_result error_message = json.dumps(
) str_line.choices[0].content_filter_result
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
)
raise litellm.AzureOpenAIError( raise litellm.AzureOpenAIError(
status_code=400, message=error_message status_code=400, message=error_message
) )
@ -9683,6 +9986,39 @@ class CustomStreamWrapper:
"finish_reason": finish_reason, "finish_reason": finish_reason,
} }
def handle_watsonx_stream(self, chunk):
try:
if isinstance(chunk, dict):
parsed_response = chunk
elif isinstance(chunk, (str, bytes)):
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "generated_text" in chunk:
response = chunk.replace("data: ", "").strip()
parsed_response = json.loads(response)
else:
return {"text": "", "is_finished": False}
else:
print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
raise ValueError(
f"Unable to parse response. Original response: {chunk}"
)
results = parsed_response.get("results", [])
if len(results) > 0:
text = results[0].get("generated_text", "")
finish_reason = results[0].get("stop_reason")
is_finished = finish_reason != "not_finished"
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
"prompt_tokens": results[0].get("input_token_count", None),
"completion_tokens": results[0].get("generated_token_count", None),
}
return {"text": "", "is_finished": False}
except Exception as e:
raise e
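# Reconstructed example of the SSE payload this handler expects (values illustrative):
_example_chunk = (
    'data: {"results": [{"generated_text": " Hello", "generated_token_count": 3, '
    '"input_token_count": 12, "stop_reason": "not_finished"}]}'
)
# handle_watsonx_stream strips the "data: " prefix, JSON-decodes the payload, and returns
# {"text": " Hello", "is_finished": False, "finish_reason": "not_finished",
#  "prompt_tokens": 12, "completion_tokens": 3}; any stop_reason other than
# "not_finished" marks the final chunk of the stream.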
def model_response_creator(self): def model_response_creator(self):
model_response = ModelResponse(stream=True, model=self.model) model_response = ModelResponse(stream=True, model=self.model)
if self.response_id is not None: if self.response_id is not None:
@ -9938,6 +10274,11 @@ class CustomStreamWrapper:
print_verbose(f"completion obj content: {completion_obj['content']}") print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]: if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"] self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "watsonx":
response_obj = self.handle_watsonx_stream(chunk)
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "text-completion-openai": elif self.custom_llm_provider == "text-completion-openai":
response_obj = self.handle_openai_text_completion_chunk(chunk) response_obj = self.handle_openai_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"] completion_obj["content"] = response_obj["text"]
@ -10123,12 +10464,23 @@ class CustomStreamWrapper:
model_response.id = original_chunk.id model_response.id = original_chunk.id
self.response_id = original_chunk.id self.response_id = original_chunk.id
if len(original_chunk.choices) > 0: if len(original_chunk.choices) > 0:
try: choices = []
delta = dict(original_chunk.choices[0].delta) for idx, choice in enumerate(original_chunk.choices):
print_verbose(f"original delta: {delta}") try:
model_response.choices[0].delta = Delta(**delta) if isinstance(choice, BaseModel):
except Exception as e: try:
model_response.choices[0].delta = Delta() choice_json = choice.model_dump()
except Exception as e:
choice_json = choice.dict()
choice_json.pop(
"finish_reason", None
) # for mistral etc. which return a value in their last chunk (not-openai compatible).
print_verbose(f"choice_json: {choice_json}")
choices.append(StreamingChoices(**choice_json))
except Exception as e:
choices.append(StreamingChoices())
print_verbose(f"choices in streaming: {choices}")
model_response.choices = choices
else: else:
return return
model_response.system_fingerprint = ( model_response.system_fingerprint = (
@ -10173,11 +10525,11 @@ class CustomStreamWrapper:
) )
self.holding_chunk = "" self.holding_chunk = ""
# if delta is None # if delta is None
is_delta_empty = self.is_delta_empty( _is_delta_empty = self.is_delta_empty(
delta=model_response.choices[0].delta delta=model_response.choices[0].delta
) )
if is_delta_empty: if _is_delta_empty:
# get any function call arguments # get any function call arguments
model_response.choices[0].finish_reason = map_finish_reason( model_response.choices[0].finish_reason = map_finish_reason(
finish_reason=self.received_finish_reason finish_reason=self.received_finish_reason


@ -1418,6 +1418,123 @@
"litellm_provider": "replicate", "litellm_provider": "replicate",
"mode": "chat" "mode": "chat"
}, },
"replicate/meta/llama-2-13b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-13b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000005,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-70b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-2-7b-chat": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-70b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000065,
"output_cost_per_token": 0.00000275,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/meta/llama-3-8b-instruct": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mistral-7b-instruct-v0.2": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000005,
"output_cost_per_token": 0.00000025,
"litellm_provider": "replicate",
"mode": "chat"
},
"replicate/mistralai/mixtral-8x7b-instruct-v0.1": {
"max_tokens": 4096,
"max_input_tokens": 4096,
"max_output_tokens": 4096,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.000001,
"litellm_provider": "replicate",
"mode": "chat"
},
"openrouter/openai/gpt-3.5-turbo": { "openrouter/openai/gpt-3.5-turbo": {
"max_tokens": 4095, "max_tokens": 4095,
"input_cost_per_token": 0.0000015, "input_cost_per_token": 0.0000015,
@ -1455,6 +1572,17 @@
"litellm_provider": "openrouter", "litellm_provider": "openrouter",
"mode": "chat" "mode": "chat"
}, },
"openrouter/anthropic/claude-3-opus": {
"max_tokens": 4096,
"max_input_tokens": 200000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.000015,
"output_cost_per_token": 0.000075,
"litellm_provider": "openrouter",
"mode": "chat",
"supports_function_calling": true,
"tool_use_system_prompt_tokens": 395
},
"openrouter/google/palm-2-chat-bison": { "openrouter/google/palm-2-chat-bison": {
"max_tokens": 8000, "max_tokens": 8000,
"input_cost_per_token": 0.0000005, "input_cost_per_token": 0.0000005,
@ -2379,6 +2507,24 @@
"litellm_provider": "bedrock", "litellm_provider": "bedrock",
"mode": "chat" "mode": "chat"
}, },
"meta.llama3-8b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-70b-instruct-v1:0": {
"max_tokens": 8192,
"max_input_tokens": 8192,
"max_output_tokens": 8192,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": { "512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77, "max_tokens": 77,
"max_input_tokens": 77, "max_input_tokens": 77,

View file

@@ -61,14 +61,14 @@ model_list:
       api_key: my-fake-key
       api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
       stream_timeout: 0.001
-      rpm: 10
+      rpm: 100
   - model_name: fake-openai-endpoint-3
     litellm_params:
       model: openai/my-fake-model-2
       api_key: my-fake-key
       api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
       stream_timeout: 0.001
-      rpm: 10
+      rpm: 100
   - model_name: "*"
     litellm_params:
       model: openai/*
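For reference, rpm on a deployment is the requests-per-minute ceiling the litellm router uses when spreading load across deployments; raising it from 10 to 100 lets these fake test endpoints absorb more traffic. A minimal Python equivalent of the same idea, with the placeholder names and keys taken from the config above:

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint-3",
            "litellm_params": {
                "model": "openai/my-fake-model-2",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
                "rpm": 100,  # requests-per-minute cap for this deployment
            },
        }
    ]
)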

View file

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.35.27"
+version = "1.35.36"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -80,7 +80,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.35.27"
+version = "1.35.36"
 version_files = [
     "pyproject.toml:^version"
 ]

Some files were not shown because too many files have changed in this diff.