forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_load_balancing_transcription_endpoints
commit caa99f43bf
22 changed files with 704 additions and 233 deletions
@@ -1,5 +1,84 @@
+import Image from '@theme/IdealImage';
+
 # 🔥 Load Test LiteLLM
 
+## Load Test LiteLLM Proxy - 1500+ req/s
+
+## 1500+ concurrent requests/s
+
+LiteLLM proxy has been load tested to handle 1500+ concurrent req/s
+
+```python
+import time, asyncio
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import uuid
+import traceback
+
+# base_url - litellm proxy endpoint
+# api_key - litellm proxy api-key, created when the proxy is started with auth
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+
+async def litellm_completion():
+    # Your existing code for litellm_completion goes here
+    try:
+        response = await litellm_client.chat.completions.create(
+            model="azure-gpt-3.5",
+            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+        )
+        print(response)
+        return response
+
+    except Exception as e:
+        # If there's an exception, log the error message
+        with open("error_log.txt", "a") as error_log:
+            error_log.write(f"Error during completion: {str(e)}\n")
+        pass
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 1500  # Number of concurrent tasks
+        tasks = [litellm_completion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    asyncio.run(main())
+```
+
+### Throughput - 30% Increase
+
+LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API
+
+<Image img={require('../img/throughput.png')} />
+
+### Latency Added - 0.00325 seconds
+
+LiteLLM proxy adds **0.00325 seconds** of latency compared to using the raw OpenAI API
+
+<Image img={require('../img/latency.png')} />
+
+### Testing LiteLLM Proxy with Locust
+
+- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures
+
+<Image img={require('../img/locust.png')} />
+
+## Load Test LiteLLM SDK vs OpenAI
 Here is a script to load test LiteLLM vs OpenAI
 
 ```python
@@ -85,3 +164,4 @@ async def loadtest_fn():
 asyncio.run(loadtest_fn())
 
+
 ```
@@ -49,9 +49,9 @@ model_list:
       rpm: 6
   - model_name: anthropic-claude
     litellm_params:
-      model="bedrock/anthropic.claude-instant-v1"
+      model: bedrock/anthropic.claude-instant-v1
       ### [OPTIONAL] SET AWS REGION ###
-      aws_region_name="us-east-1"
+      aws_region_name: us-east-1
   - model_name: vllm-models
     litellm_params:
       model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@@ -68,6 +68,72 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun
 
 </TabItem>
 
+<TabItem value="kubernetes" label="Kubernetes">
+
+Deploying a config-file-based LiteLLM instance just requires a simple Deployment that loads
+the config.yaml via a ConfigMap. It is also good practice to declare API keys as env vars
+and attach them to the pod as an Opaque Secret.
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: litellm-config-file
+data:
+  config.yaml: |
+    model_list:
+      - model_name: gpt-3.5-turbo
+        litellm_params:
+          model: azure/gpt-turbo-small-ca
+          api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+          api_key: os.environ/CA_AZURE_OPENAI_API_KEY
+---
+apiVersion: v1
+kind: Secret
+type: Opaque
+metadata:
+  name: litellm-secrets
+data:
+  CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-deployment
+  labels:
+    app: litellm
+spec:
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      containers:
+        - name: litellm
+          image: ghcr.io/berriai/litellm:main-latest # it is recommended to fix a version generally
+          ports:
+            - containerPort: 4000
+          volumeMounts:
+            - name: config-volume
+              mountPath: /app/proxy_server_config.yaml
+              subPath: config.yaml
+          envFrom:
+            - secretRef:
+                name: litellm-secrets
+      volumes:
+        - name: config-volume
+          configMap:
+            name: litellm-config-file
+```
+
+> [!TIP]
+> To avoid issues with predictability, difficulties in rollback, and inconsistent environments, use versioning or SHA digests (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`.
+
+</TabItem>
+
 </Tabs>
 
 ## Deploy with Database
@@ -350,17 +416,3 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
 
 Your LiteLLM container should be running now on the defined port e.g. `8000`.
 
-
-## LiteLLM Proxy Performance
-
-LiteLLM proxy has been load tested to handle 1500 req/s.
-
-### Throughput - 30% Increase
-LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
-<Image img={require('../../img/throughput.png')} />
-
-### Latency Added - 0.00325 seconds
-LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
-<Image img={require('../../img/latency.png')} />
-
@@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
 :::
 
 Features:
-- [ ] Content Moderation with LlamaGuard
-- [ ] Content Moderation with Google Text Moderations
-- [ ] Content Moderation with LLM Guard
-- [ ] Reject calls from Blocked User list
-- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
-- [ ] Tracking Spend for Custom Tags
+- ✅ Content Moderation with LlamaGuard
+- ✅ Content Moderation with Google Text Moderations
+- ✅ Content Moderation with LLM Guard
+- ✅ Reject calls from Blocked User list
+- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
+- ✅ Don't log/store specific requests (eg confidential LLM requests)
+- ✅ Tracking Spend for Custom Tags
 
-## Content Moderation with LlamaGuard
+## Content Moderation
+### Content Moderation with LlamaGuard
 
 Currently works with Sagemaker's LlamaGuard endpoint.
 
@@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
 os.environ["AWS_REGION_NAME"] = ""
 ```
 
-### Customize LlamaGuard prompt
+#### Customize LlamaGuard prompt
 
 To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
 
@@ -51,7 +53,7 @@ callbacks: ["llamaguard_moderations"]
 llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
 ```
 
-## Content Moderation with LLM Guard
+### Content Moderation with LLM Guard
 
 Set the LLM Guard API Base in your environment
 
@@ -78,7 +80,7 @@ Expected results:
 LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
 ```
 
-## Content Moderation with Google Text Moderation
+### Content Moderation with Google Text Moderation
 
 Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
 
@@ -89,7 +91,7 @@ litellm_settings:
 callbacks: ["google_text_moderation"]
 ```
 
-### Set custom confidence thresholds
+#### Set custom confidence thresholds
 
 Google Moderations checks the text against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
 
@@ -133,6 +135,33 @@ Here are the category specific values:
 | "legal" | legal_threshold: 0.1 |
 
 
+## Incognito Requests - Don't log anything
+
+When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs on litellm**
+
+```python
+import openai
+
+client = openai.OpenAI(
+    api_key="anything",             # proxy api-key
+    base_url="http://0.0.0.0:8000"  # litellm proxy
+)
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ],
+    extra_body={
+        "no-log": True
+    }
+)
+
+print(response)
+```
+
 ## Enable Blocked User Lists
 If any call is made to the proxy with a blocked user id, it'll be rejected - use this if you want to let users opt out of AI features
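For illustration, a minimal client-side sketch of what a rejected call could look like, assuming the proxy has its blocked-user check enabled and the caller identifies the end user via the standard OpenAI `user` field (the user id below is hypothetical; the exact proxy configuration keys are not shown here and are covered in the enterprise docs):

```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")  # litellm proxy

try:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
        user="user_id_1",  # hypothetical id assumed to be on the proxy's blocked-user list
    )
    print(response)
except openai.APIError as e:
    # calls from blocked user ids are expected to be rejected by the proxy
    print("request rejected:", e)
```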
BIN docs/my-website/img/locust.png (new binary file, not shown; 109 KiB)
@@ -570,7 +570,7 @@ from .utils import (
     _calculate_retry_after,
     _should_retry,
     get_secret,
-    get_mapped_model_params,
+    get_supported_openai_params,
 )
 from .llms.huggingface_restapi import HuggingfaceConfig
 from .llms.anthropic import AnthropicConfig
@@ -31,6 +31,18 @@ def _turn_on_debug():
     verbose_proxy_logger.setLevel(level=logging.DEBUG)  # set proxy logs to debug
 
 
+def _disable_debugging():
+    verbose_logger.disabled = True
+    verbose_router_logger.disabled = True
+    verbose_proxy_logger.disabled = True
+
+
+def _enable_debugging():
+    verbose_logger.disabled = False
+    verbose_router_logger.disabled = False
+    verbose_proxy_logger.disabled = False
+
+
 def print_verbose(print_statement):
     try:
         if set_verbose:
@@ -15,6 +15,7 @@ import litellm, json
 import httpx
 from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
 from openai import AzureOpenAI, AsyncAzureOpenAI
+import uuid
 
 
 class AzureOpenAIError(Exception):
@@ -271,6 +272,14 @@ class AzureChatCompletion(BaseLLM):
                 azure_client = AzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(
+                    azure_client._custom_query, dict
+                ):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault(
+                        "api-version", api_version
+                    )
+
             response = azure_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
             stringified_response = response.model_dump()
             ## LOGGING
@@ -334,10 +343,17 @@ class AzureChatCompletion(BaseLLM):
                 azure_client_params["api_key"] = api_key
             elif azure_ad_token is not None:
                 azure_client_params["azure_ad_token"] = azure_ad_token
+
+            # setting Azure client
             if client is None:
                 azure_client = AsyncAzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(
+                    azure_client._custom_query, dict
+                ):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault("api-version", api_version)
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
@@ -402,6 +418,9 @@ class AzureChatCompletion(BaseLLM):
                 azure_client = AzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(azure_client._custom_query, dict):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault("api-version", api_version)
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
@@ -455,6 +474,11 @@ class AzureChatCompletion(BaseLLM):
                 azure_client = AsyncAzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(
+                    azure_client._custom_query, dict
+                ):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault("api-version", api_version)
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
@@ -813,6 +837,19 @@ class AzureChatCompletion(BaseLLM):
             azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params)  # type: ignore
         else:
             azure_client = client
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=f"audio_file_{uuid.uuid4()}",
+            api_key=azure_client.api_key,
+            additional_args={
+                "headers": {"Authorization": f"Bearer {azure_client.api_key}"},
+                "api_base": azure_client._base_url._uri_reference,
+                "atranscription": True,
+                "complete_input_dict": data,
+            },
+        )
+
         response = azure_client.audio.transcriptions.create(
             **data, timeout=timeout  # type: ignore
         )
@@ -850,6 +887,20 @@ class AzureChatCompletion(BaseLLM):
         else:
             async_azure_client = client
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=f"audio_file_{uuid.uuid4()}",
+            api_key=async_azure_client.api_key,
+            additional_args={
+                "headers": {
+                    "Authorization": f"Bearer {async_azure_client.api_key}"
+                },
+                "api_base": async_azure_client._base_url._uri_reference,
+                "atranscription": True,
+                "complete_input_dict": data,
+            },
+        )
+
         response = await async_azure_client.audio.transcriptions.create(
             **data, timeout=timeout
         )  # type: ignore
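The api-version handling above matters when a pre-built Azure client is reused: a user-supplied `api_version` is now propagated onto the client's query params. For reference only (not part of this commit), a minimal sketch of a call that supplies an explicit `api_version`; the deployment name and version string are placeholders:

```python
import litellm

# hypothetical Azure deployment name and api version, shown only to illustrate
# how api_version flows through to the (possibly reused) AzureOpenAI client
response = litellm.completion(
    model="azure/my-gpt-35-deployment",
    api_version="2023-07-01-preview",
    messages=[{"role": "user", "content": "hello"}],
)
print(response.choices[0].message.content)
```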
@@ -238,14 +238,22 @@ class OpenAIChatCompletion(BaseLLM):
                     status_code=422, message=f"Timeout needs to be a float"
                 )
 
-            if custom_llm_provider == "mistral":
-                # check if message content passed in as list, and not string
-                messages = prompt_factory(
-                    model=model,
-                    messages=messages,
-                    custom_llm_provider=custom_llm_provider,
-                )
+            if custom_llm_provider != "openai":
+                # process all OpenAI compatible provider logic here
+                if custom_llm_provider == "mistral":
+                    # check if message content passed in as list, and not string
+                    messages = prompt_factory(
+                        model=model,
+                        messages=messages,
+                        custom_llm_provider=custom_llm_provider,
+                    )
+                if custom_llm_provider == "perplexity" and messages is not None:
+                    # check if messages.name is passed + supported, if not supported remove
+                    messages = prompt_factory(
+                        model=model,
+                        messages=messages,
+                        custom_llm_provider=custom_llm_provider,
+                    )
 
             for _ in range(
                 2
             ):  # if call fails due to alternating messages, retry with reformatted message
@@ -556,6 +556,7 @@ def anthropic_messages_pt(messages: list):
     3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm)
     4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
     5. System messages are a separate param to the Messages API (used for tool calling)
+    6. Ensure we only accept role, content. (message.name is not supported)
     """
     ## Ensure final assistant message has no trailing whitespace
     last_assistant_message_idx: Optional[int] = None
@@ -583,7 +584,9 @@ def anthropic_messages_pt(messages: list):
                 new_content.append({"type": "text", "text": m["text"]})
             new_messages.append({"role": messages[0]["role"], "content": new_content})  # type: ignore
         else:
-            new_messages.append(messages[0])
+            new_messages.append(
+                {"role": messages[0]["role"], "content": messages[0]["content"]}
+            )
 
         return new_messages
 
@@ -606,7 +609,9 @@ def anthropic_messages_pt(messages: list):
                 new_content.append({"type": "text", "content": m["text"]})
             new_messages.append({"role": messages[i]["role"], "content": new_content})  # type: ignore
         else:
-            new_messages.append(messages[i])
+            new_messages.append(
+                {"role": messages[i]["role"], "content": messages[i]["content"]}
+            )
 
         if messages[i]["role"] == messages[i + 1]["role"]:
             if messages[i]["role"] == "user":
@@ -897,6 +902,10 @@ def prompt_factory(
             return anthropic_pt(messages=messages)
         elif "mistral." in model:
             return mistral_instruct_pt(messages=messages)
+    elif custom_llm_provider == "perplexity":
+        for message in messages:
+            message.pop("name", None)
+        return messages
     try:
         if "meta-llama/llama-2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
@@ -488,6 +488,8 @@ def completion(
     ### ASYNC CALLS ###
     acompletion = kwargs.get("acompletion", False)
     client = kwargs.get("client", None)
+    ### Admin Controls ###
+    no_log = kwargs.get("no-log", False)
     ######## end of unpacking kwargs ###########
     openai_params = [
         "functions",
@@ -564,6 +566,7 @@ def completion(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -727,6 +730,7 @@ def completion(
         model_info=model_info,
         proxy_server_request=proxy_server_request,
         preset_cache_key=preset_cache_key,
+        no_log=no_log,
     )
     logging.update_environment_variables(
         model=model,
@@ -2418,6 +2422,7 @@ def embedding(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -16,6 +16,13 @@ from importlib import resources
 import shutil
 
 telemetry = None
+default_num_workers = 1
+try:
+    default_num_workers = os.cpu_count() or 1
+    if default_num_workers is not None and default_num_workers > 0:
+        default_num_workers -= 1
+except:
+    pass
 
 
 def append_query_params(url, params):
@@ -57,7 +64,7 @@ def is_port_in_use(port):
 @click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
 @click.option(
     "--num_workers",
-    default=1,
+    default=default_num_workers,
     help="Number of gunicorn workers to spin up",
     envvar="NUM_WORKERS",
 )
@@ -5,12 +5,9 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-  - model_name: azure-gpt-3.5
-    litellm_params:
-      model: gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      access_groups: ["public"]
+litellm_settings:
+  set_verbose: True
+  success_callback: ["langfuse"]
 
 router_settings:
   set_verbose: True
   debug_level: "DEBUG"
litellm/proxy/proxy_load_test/litellm_proxy_config.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: openai/my-fake-model
+      api_key: my-fake-key
+      api_base: http://0.0.0.0:8090
litellm/proxy/proxy_load_test/locustfile.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+from locust import HttpUser, task, between
+
+
+class MyUser(HttpUser):
+    wait_time = between(1, 5)
+
+    @task
+    def chat_completion(self):
+        headers = {
+            "Content-Type": "application/json",
+            # Include any additional headers you may need for authentication, etc.
+        }
+
+        # Customize the payload with "model" and "messages" keys
+        payload = {
+            "model": "gpt-3.5-turbo",
+            "messages": [
+                {"role": "system", "content": "You are a chat bot."},
+                {"role": "user", "content": "Hello, how are you?"},
+            ],
+            # Add more data as necessary
+        }
+
+        # Make a POST request to the "chat/completions" endpoint
+        response = self.client.post("chat/completions", json=payload, headers=headers)
+
+        # Print or log the response if needed
litellm/proxy/proxy_load_test/openai_endpoint.py (new file, 50 lines)
@@ -0,0 +1,50 @@
+# import sys, os
+# sys.path.insert(
+#     0, os.path.abspath("../")
+# )  # Adds the parent directory to the system path
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+async def completion(request: Request):
+    return {
+        "id": "chatcmpl-123",
+        "object": "chat.completion",
+        "created": 1677652288,
+        "model": "gpt-3.5-turbo-0125",
+        "system_fingerprint": "fp_44709d6fcb",
+        "choices": [
+            {
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": "\n\nHello there, how may I assist you today?",
+                },
+                "logprobs": None,
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+    }
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    # run this on 8090, 8091, 8092 and 8093
+    uvicorn.run(app, host="0.0.0.0", port=8090)
@@ -1677,9 +1677,9 @@ class ProxyConfig:
                 # these are litellm callbacks - "langfuse", "sentry", "wandb"
                 else:
                     litellm.success_callback.append(callback)
-                    verbose_proxy_logger.debug(
+                    print(  # noqa
                         f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
-                    )
+                    )  # noqa
             elif key == "failure_callback":
                 litellm.failure_callback = []
 
@@ -2672,6 +2672,11 @@ async def chat_completion(
         except:
             data = json.loads(body_str)
 
+        # Azure OpenAI only: check if user passed api-version
+        query_params = dict(request.query_params)
+        if "api-version" in query_params:
+            data["api_version"] = query_params["api-version"]
+
         # Include original request and headers in the data
         data["proxy_server_request"] = {
             "url": str(request.url),
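For context on the `api-version` hunk above, a small sketch (not part of the diff) of a client that ends up sending `api-version` as a query parameter to the proxy: the `AzureOpenAI` SDK client appends it to every request URL, and the proxy now forwards it as `data["api_version"]`. The base URL, key, and model name below are placeholders:

```python
from openai import AzureOpenAI

# hypothetical values - point the Azure SDK client at the litellm proxy
client = AzureOpenAI(
    api_key="sk-1234",
    api_version="2023-07-01-preview",   # sent as ?api-version=... on each request
    azure_endpoint="http://0.0.0.0:4000",
)

response = client.chat.completions.create(
    model="azure-gpt-3.5",
    messages=[{"role": "user", "content": "hello"}],
)
print(response)
```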
||||||
|
|
||||||
|
|
||||||
def test_completion_claude_3_empty_response():
|
def test_completion_claude_3_empty_response():
|
||||||
|
litellm.set_verbose = True
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
|
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
|
||||||
},
|
},
|
||||||
{"role": "user", "content": "Hi gm!"},
|
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
|
||||||
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
|
|
|
@ -511,7 +511,7 @@ def test_completion_mistral_api_stream():
|
||||||
|
|
||||||
|
|
||||||
def test_completion_deep_infra_stream():
|
def test_completion_deep_infra_stream():
|
||||||
# deep infra currently includes role in the 2nd chunk
|
# deep infra,currently includes role in the 2nd chunk
|
||||||
# waiting for them to make a fix on this
|
# waiting for them to make a fix on this
|
||||||
litellm.set_verbose = True
|
litellm.set_verbose = True
|
||||||
try:
|
try:
|
||||||
|
|
litellm/utils.py (482 changes)
@ -981,6 +981,7 @@ class Logging:
|
||||||
curl_command = self.model_call_details
|
curl_command = self.model_call_details
|
||||||
|
|
||||||
# only print verbose if verbose logger is not set
|
# only print verbose if verbose logger is not set
|
||||||
|
|
||||||
if verbose_logger.level == 0:
|
if verbose_logger.level == 0:
|
||||||
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
|
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
|
||||||
print_verbose(f"\033[92m{curl_command}\033[0m\n")
|
print_verbose(f"\033[92m{curl_command}\033[0m\n")
|
||||||
|
@ -1312,6 +1313,15 @@ class Logging:
|
||||||
|
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
try:
|
try:
|
||||||
|
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||||
|
if litellm_params.get("no-log", False) == True:
|
||||||
|
# proxy cost tracking cal backs should run
|
||||||
|
if not (
|
||||||
|
isinstance(callback, CustomLogger)
|
||||||
|
and "_PROXY_" in callback.__class__.__name__
|
||||||
|
):
|
||||||
|
print_verbose("no-log request, skipping logging")
|
||||||
|
continue
|
||||||
if callback == "lite_debugger":
|
if callback == "lite_debugger":
|
||||||
print_verbose("reaches lite_debugger for logging!")
|
print_verbose("reaches lite_debugger for logging!")
|
||||||
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
|
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
|
||||||
|
@ -1740,7 +1750,20 @@ class Logging:
|
||||||
callbacks = litellm._async_success_callback
|
callbacks = litellm._async_success_callback
|
||||||
verbose_logger.debug(f"Async success callbacks: {callbacks}")
|
verbose_logger.debug(f"Async success callbacks: {callbacks}")
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
|
# check if callback can run for this request
|
||||||
|
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||||
|
if litellm_params.get("no-log", False) == True:
|
||||||
|
# proxy cost tracking cal backs should run
|
||||||
|
if not (
|
||||||
|
isinstance(callback, CustomLogger)
|
||||||
|
and "_PROXY_" in callback.__class__.__name__
|
||||||
|
):
|
||||||
|
print_verbose("no-log request, skipping logging")
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
|
if kwargs.get("no-log", False) == True:
|
||||||
|
print_verbose("no-log request, skipping logging")
|
||||||
|
continue
|
||||||
if callback == "cache" and litellm.cache is not None:
|
if callback == "cache" and litellm.cache is not None:
|
||||||
# set_cache once complete streaming response is built
|
# set_cache once complete streaming response is built
|
||||||
print_verbose("async success_callback: reaches cache for logging!")
|
print_verbose("async success_callback: reaches cache for logging!")
|
||||||
|
@ -3026,11 +3049,13 @@ def client(original_function):
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
|
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
|
||||||
)
|
)
|
||||||
|
# check if user does not want this to be logged
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
logging_obj.async_success_handler(result, start_time, end_time)
|
logging_obj.async_success_handler(result, start_time, end_time)
|
||||||
)
|
)
|
||||||
threading.Thread(
|
threading.Thread(
|
||||||
target=logging_obj.success_handler, args=(result, start_time, end_time)
|
target=logging_obj.success_handler,
|
||||||
|
args=(result, start_time, end_time),
|
||||||
).start()
|
).start()
|
||||||
|
|
||||||
# RETURN RESULT
|
# RETURN RESULT
|
||||||
|
@ -3933,6 +3958,7 @@ def get_litellm_params(
|
||||||
proxy_server_request=None,
|
proxy_server_request=None,
|
||||||
acompletion=None,
|
acompletion=None,
|
||||||
preset_cache_key=None,
|
preset_cache_key=None,
|
||||||
|
no_log=None,
|
||||||
):
|
):
|
||||||
litellm_params = {
|
litellm_params = {
|
||||||
"acompletion": acompletion,
|
"acompletion": acompletion,
|
||||||
|
@ -3949,6 +3975,7 @@ def get_litellm_params(
|
||||||
"model_info": model_info,
|
"model_info": model_info,
|
||||||
"proxy_server_request": proxy_server_request,
|
"proxy_server_request": proxy_server_request,
|
||||||
"preset_cache_key": preset_cache_key,
|
"preset_cache_key": preset_cache_key,
|
||||||
|
"no-log": no_log,
|
||||||
"stream_response": {}, # litellm_call_id: ModelResponse Dict
|
"stream_response": {}, # litellm_call_id: ModelResponse Dict
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4269,15 +4296,9 @@ def get_optional_params(
|
||||||
## raise exception if provider doesn't support passed in param
|
## raise exception if provider doesn't support passed in param
|
||||||
if custom_llm_provider == "anthropic":
|
if custom_llm_provider == "anthropic":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"stop",
|
)
|
||||||
"temperature",
|
|
||||||
"top_p",
|
|
||||||
"max_tokens",
|
|
||||||
"tools",
|
|
||||||
"tool_choice",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle anthropic params
|
# handle anthropic params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4301,17 +4322,9 @@ def get_optional_params(
|
||||||
optional_params["tools"] = tools
|
optional_params["tools"] = tools
|
||||||
elif custom_llm_provider == "cohere":
|
elif custom_llm_provider == "cohere":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"logit_bias",
|
|
||||||
"top_p",
|
|
||||||
"frequency_penalty",
|
|
||||||
"presence_penalty",
|
|
||||||
"stop",
|
|
||||||
"n",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle cohere params
|
# handle cohere params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4334,14 +4347,9 @@ def get_optional_params(
|
||||||
optional_params["stop_sequences"] = stop
|
optional_params["stop_sequences"] = stop
|
||||||
elif custom_llm_provider == "maritalk":
|
elif custom_llm_provider == "maritalk":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"presence_penalty",
|
|
||||||
"stop",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle cohere params
|
# handle cohere params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4360,14 +4368,9 @@ def get_optional_params(
|
||||||
optional_params["stopping_tokens"] = stop
|
optional_params["stopping_tokens"] = stop
|
||||||
elif custom_llm_provider == "replicate":
|
elif custom_llm_provider == "replicate":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"stop",
|
|
||||||
"seed",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4388,7 +4391,9 @@ def get_optional_params(
|
||||||
optional_params["stop_sequences"] = stop
|
optional_params["stop_sequences"] = stop
|
||||||
elif custom_llm_provider == "huggingface":
|
elif custom_llm_provider == "huggingface":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4427,16 +4432,9 @@ def get_optional_params(
|
||||||
) # since we handle translating echo, we should not send it to TGI request
|
) # since we handle translating echo, we should not send it to TGI request
|
||||||
elif custom_llm_provider == "together_ai":
|
elif custom_llm_provider == "together_ai":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"stop",
|
|
||||||
"frequency_penalty",
|
|
||||||
"tools",
|
|
||||||
"tool_choice",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4457,16 +4455,9 @@ def get_optional_params(
|
||||||
optional_params["tool_choice"] = tool_choice
|
optional_params["tool_choice"] = tool_choice
|
||||||
elif custom_llm_provider == "ai21":
|
elif custom_llm_provider == "ai21":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"n",
|
)
|
||||||
"temperature",
|
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"stop",
|
|
||||||
"frequency_penalty",
|
|
||||||
"presence_penalty",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4489,7 +4480,9 @@ def get_optional_params(
|
||||||
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
|
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
|
||||||
): # https://developers.generativeai.google/tutorials/curl_quickstart
|
): # https://developers.generativeai.google/tutorials/curl_quickstart
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4518,14 +4511,9 @@ def get_optional_params(
|
||||||
):
|
):
|
||||||
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
|
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"temperature",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"top_p",
|
)
|
||||||
"max_tokens",
|
|
||||||
"stream",
|
|
||||||
"tools",
|
|
||||||
"tool_choice",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4555,7 +4543,9 @@ def get_optional_params(
|
||||||
)
|
)
|
||||||
elif custom_llm_provider == "sagemaker":
|
elif custom_llm_provider == "sagemaker":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4582,8 +4572,10 @@ def get_optional_params(
|
||||||
max_tokens = 1
|
max_tokens = 1
|
||||||
optional_params["max_new_tokens"] = max_tokens
|
optional_params["max_new_tokens"] = max_tokens
|
||||||
elif custom_llm_provider == "bedrock":
|
elif custom_llm_provider == "bedrock":
|
||||||
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
if "ai21" in model:
|
if "ai21" in model:
|
||||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
|
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
|
||||||
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
|
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
|
||||||
|
@ -4596,9 +4588,6 @@ def get_optional_params(
|
||||||
if stream:
|
if stream:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif "anthropic" in model:
|
elif "anthropic" in model:
|
||||||
supported_params = get_mapped_model_params(
|
|
||||||
model=model, custom_llm_provider=custom_llm_provider
|
|
||||||
)
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# anthropic params on bedrock
|
# anthropic params on bedrock
|
||||||
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
|
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
|
||||||
|
@ -4615,7 +4604,6 @@ def get_optional_params(
|
||||||
optional_params=optional_params,
|
optional_params=optional_params,
|
||||||
)
|
)
|
||||||
elif "amazon" in model: # amazon titan llms
|
elif "amazon" in model: # amazon titan llms
|
||||||
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4632,7 +4620,6 @@ def get_optional_params(
|
||||||
if stream:
|
if stream:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif "meta" in model: # amazon / meta llms
|
elif "meta" in model: # amazon / meta llms
|
||||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4644,7 +4631,6 @@ def get_optional_params(
|
||||||
if stream:
|
if stream:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif "cohere" in model: # cohere models on bedrock
|
elif "cohere" in model: # cohere models on bedrock
|
||||||
supported_params = ["stream", "temperature", "max_tokens"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle cohere params
|
# handle cohere params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4654,7 +4640,6 @@ def get_optional_params(
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
optional_params["max_tokens"] = max_tokens
|
optional_params["max_tokens"] = max_tokens
|
||||||
elif "mistral" in model:
|
elif "mistral" in model:
|
||||||
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# mistral params on bedrock
|
# mistral params on bedrock
|
||||||
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
|
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
|
||||||
|
@ -4698,7 +4683,9 @@ def get_optional_params(
|
||||||
optional_params["stop_sequences"] = stop
|
optional_params["stop_sequences"] = stop
|
||||||
elif custom_llm_provider == "cloudflare":
|
elif custom_llm_provider == "cloudflare":
|
||||||
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
|
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
|
||||||
supported_params = ["max_tokens", "stream"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4706,14 +4693,9 @@ def get_optional_params(
|
||||||
if stream is not None:
|
if stream is not None:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif custom_llm_provider == "ollama":
|
elif custom_llm_provider == "ollama":
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"max_tokens",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"stream",
|
)
|
||||||
"top_p",
|
|
||||||
"temperature",
|
|
||||||
"frequency_penalty",
|
|
||||||
"stop",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4737,16 +4719,9 @@ def get_optional_params(
|
||||||
non_default_params=non_default_params, optional_params=optional_params
|
non_default_params=non_default_params, optional_params=optional_params
|
||||||
)
|
)
|
||||||
elif custom_llm_provider == "nlp_cloud":
|
elif custom_llm_provider == "nlp_cloud":
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"max_tokens",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"stream",
|
)
|
||||||
"temperature",
|
|
||||||
"top_p",
|
|
||||||
"presence_penalty",
|
|
||||||
"frequency_penalty",
|
|
||||||
"n",
|
|
||||||
"stop",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@@ -4766,7 +4741,9 @@ def get_optional_params(
         if stop is not None:
             optional_params["stop_sequences"] = stop
     elif custom_llm_provider == "petals":
-        supported_params = ["max_tokens", "temperature", "top_p", "stream"]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         # max_new_tokens=1,temperature=0.9, top_p=0.6
         if max_tokens is not None:
@@ -4778,18 +4755,9 @@ def get_optional_params(
         if stream:
             optional_params["stream"] = stream
     elif custom_llm_provider == "deepinfra":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "n",
-            "stream",
-            "stop",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "user",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         if temperature is not None:
             if (
@@ -4816,14 +4784,9 @@ def get_optional_params(
         if user:
             optional_params["user"] = user
     elif custom_llm_provider == "perplexity":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "stream",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         if temperature is not None:
             if (
@@ -4842,15 +4805,9 @@ def get_optional_params(
        if frequency_penalty:
            optional_params["frequency_penalty"] = frequency_penalty
     elif custom_llm_provider == "anyscale":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "stream",
-            "max_tokens",
-            "stop",
-            "frequency_penalty",
-            "presence_penalty",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         if model in [
             "mistralai/Mistral-7B-Instruct-v0.1",
             "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -4878,14 +4835,9 @@ def get_optional_params(
         if max_tokens:
             optional_params["max_tokens"] = max_tokens
     elif custom_llm_provider == "mistral":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "stream",
-            "max_tokens",
-            "tools",
-            "tool_choice",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         if temperature is not None:
             optional_params["temperature"] = temperature
@@ -4912,25 +4864,9 @@ def get_optional_params(
                 extra_body  # openai client supports `extra_body` param
             )
     elif custom_llm_provider == "openrouter":
-        supported_params = [
-            "functions",
-            "function_call",
-            "temperature",
-            "top_p",
-            "n",
-            "stream",
-            "stop",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "user",
-            "response_format",
-            "seed",
-            "tools",
-            "tool_choice",
-            "max_retries",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)

         if functions is not None:
@@ -4984,28 +4920,9 @@ def get_optional_params(
         )
     else:  # assume passing in params for openai/azure openai
         print_verbose(f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE")
-        supported_params = [
-            "functions",
-            "function_call",
-            "temperature",
-            "top_p",
-            "n",
-            "stream",
-            "stop",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "user",
-            "response_format",
-            "seed",
-            "tools",
-            "tool_choice",
-            "max_retries",
-            "logprobs",
-            "top_logprobs",
-            "extra_headers",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider="openai"
+        )
         _check_valid_arg(supported_params=supported_params)
         if functions is not None:
             optional_params["functions"] = functions
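The hunks above repeat a single refactor: each provider branch of `get_optional_params` drops its hard-coded `supported_params` list and instead asks one helper, `get_supported_openai_params`, which OpenAI-style parameters that provider can accept, before `_check_valid_arg` validates the caller's arguments. Below is a minimal sketch of that pattern, not LiteLLM's actual implementation: the provider table is a hypothetical reduced version, and `check_valid_arg_sketch` is a simplified stand-in for the real nested `_check_valid_arg` helper.

```python
from typing import Dict, List

# Hypothetical, heavily reduced provider table; the real helper covers many more providers.
_SUPPORTED: Dict[str, List[str]] = {
    "petals": ["max_tokens", "temperature", "top_p", "stream"],
    "perplexity": ["temperature", "top_p", "stream", "max_tokens",
                   "presence_penalty", "frequency_penalty"],
}


def get_supported_openai_params_sketch(model: str, custom_llm_provider: str) -> List[str]:
    # One lookup replaces a hard-coded list inside every provider branch.
    return _SUPPORTED.get(custom_llm_provider, ["temperature", "top_p", "stream", "max_tokens"])


def check_valid_arg_sketch(passed_params: Dict[str, object], supported_params: List[str]) -> None:
    # Simplified stand-in for _check_valid_arg: reject params the provider cannot map.
    unsupported = [k for k, v in passed_params.items() if v is not None and k not in supported_params]
    if unsupported:
        raise ValueError(f"{unsupported} not supported by this provider")


# Usage: validating params for a (hypothetical) petals deployment
supported = get_supported_openai_params_sketch(
    model="petals-team/StableBeluga2", custom_llm_provider="petals"
)
check_valid_arg_sketch({"temperature": 0.7, "logit_bias": None}, supported)
```

Centralizing the table means a provider's parameter support changes in one place instead of being duplicated across a dozen branches.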
@@ -5063,15 +4980,228 @@ def get_optional_params(
     return optional_params


-def get_mapped_model_params(model: str, custom_llm_provider: str):
+def get_supported_openai_params(model: str, custom_llm_provider: str):
     """
     Returns the supported openai params for a given model + provider

+    Example:
+    ```
+    get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
+    ```
     """
     if custom_llm_provider == "bedrock":
         if model.startswith("anthropic.claude-3"):
             return litellm.AmazonAnthropicClaude3Config().get_supported_openai_params()
-        else:
+        elif model.startswith("anthropic"):
             return litellm.AmazonAnthropicConfig().get_supported_openai_params()
+        elif model.startswith("ai21"):
+            return ["max_tokens", "temperature", "top_p", "stream"]
+        elif model.startswith("amazon"):
+            return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+        elif model.startswith("meta"):
+            return ["max_tokens", "temperature", "top_p", "stream"]
+        elif model.startswith("cohere"):
+            return ["stream", "temperature", "max_tokens"]
+        elif model.startswith("mistral"):
+            return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+    elif custom_llm_provider == "ollama_chat":
+        return litellm.OllamaChatConfig().get_supported_openai_params()
+    elif custom_llm_provider == "anthropic":
+        return [
+            "stream",
+            "stop",
+            "temperature",
+            "top_p",
+            "max_tokens",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "cohere":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "logit_bias",
+            "top_p",
+            "frequency_penalty",
+            "presence_penalty",
+            "stop",
+            "n",
+        ]
+    elif custom_llm_provider == "maritalk":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "presence_penalty",
+            "stop",
+        ]
+    elif custom_llm_provider == "openai" or custom_llm_provider == "azure":
+        return [
+            "functions",
+            "function_call",
+            "temperature",
+            "top_p",
+            "n",
+            "stream",
+            "stop",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "user",
+            "response_format",
+            "seed",
+            "tools",
+            "tool_choice",
+            "max_retries",
+            "logprobs",
+            "top_logprobs",
+            "extra_headers",
+        ]
+    elif custom_llm_provider == "openrouter":
+        return [
+            "functions",
+            "function_call",
+            "temperature",
+            "top_p",
+            "n",
+            "stream",
+            "stop",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "user",
+            "response_format",
+            "seed",
+            "tools",
+            "tool_choice",
+            "max_retries",
+        ]
+    elif custom_llm_provider == "mistral":
+        return [
+            "temperature",
+            "top_p",
+            "stream",
+            "max_tokens",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "replicate":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "stop",
+            "seed",
+        ]
+    elif custom_llm_provider == "huggingface":
+        return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+    elif custom_llm_provider == "together_ai":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "stop",
+            "frequency_penalty",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "ai21":
+        return [
+            "stream",
+            "n",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "stop",
+            "frequency_penalty",
+            "presence_penalty",
+        ]
+    elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
+        return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
+    elif custom_llm_provider == "vertex_ai":
+        return [
+            "temperature",
+            "top_p",
+            "max_tokens",
+            "stream",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "sagemaker":
+        return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+    elif custom_llm_provider == "aleph_alpha":
+        return [
+            "max_tokens",
+            "stream",
+            "top_p",
+            "temperature",
+            "presence_penalty",
+            "frequency_penalty",
+            "n",
+            "stop",
+        ]
+    elif custom_llm_provider == "cloudflare":
+        return ["max_tokens", "stream"]
+    elif custom_llm_provider == "ollama":
+        return [
+            "max_tokens",
+            "stream",
+            "top_p",
+            "temperature",
+            "frequency_penalty",
+            "stop",
+        ]
+    elif custom_llm_provider == "nlp_cloud":
+        return [
+            "max_tokens",
+            "stream",
+            "temperature",
+            "top_p",
+            "presence_penalty",
+            "frequency_penalty",
+            "n",
+            "stop",
+        ]
+    elif custom_llm_provider == "petals":
+        return ["max_tokens", "temperature", "top_p", "stream"]
+    elif custom_llm_provider == "deepinfra":
+        return [
+            "temperature",
+            "top_p",
+            "n",
+            "stream",
+            "stop",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "user",
+        ]
+    elif custom_llm_provider == "perplexity":
+        return [
+            "temperature",
+            "top_p",
+            "stream",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+        ]
+    elif custom_llm_provider == "anyscale":
+        return [
+            "temperature",
+            "top_p",
+            "stream",
+            "max_tokens",
+            "stop",
+            "frequency_penalty",
+            "presence_penalty",
+        ]


 def get_llm_provider(
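This hunk renames `get_mapped_model_params` to `get_supported_openai_params` and moves all per-provider parameter tables into it. Calling the helper directly follows the docstring example added above; the import path below is an assumption about where the function lives in this version of litellm, so adjust it to your install.

```python
from litellm.utils import get_supported_openai_params  # import path is an assumption

# Bedrock dispatches on the model prefix (anthropic.claude-3 vs anthropic/ai21/amazon/meta/cohere/mistral)
print(get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock"))

# Most providers dispatch on custom_llm_provider alone; model name is a placeholder here
print(get_supported_openai_params(model="any-model", custom_llm_provider="cloudflare"))
# expected, per the table above: ['max_tokens', 'stream']
```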
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.30.3"
+version = "1.30.4"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.30.3"
+version = "1.30.4"
 version_files = [
     "pyproject.toml:^version"
 ]
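The version bump touches two fields, `[tool.poetry].version` and `[tool.commitizen].version`, which have to stay in lockstep; commitizen's `version_files` entry is what rewrites the poetry field on a bump. An optional sanity check one could run in CI (Python 3.11+ for `tomllib`; not part of this commit):

```python
import tomllib

with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

poetry_version = pyproject["tool"]["poetry"]["version"]
commitizen_version = pyproject["tool"]["commitizen"]["version"]

# Both should read 1.30.4 after this commit.
assert poetry_version == commitizen_version, (poetry_version, commitizen_version)
print("versions in sync:", poetry_version)
```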
@@ -9,6 +9,7 @@ import sys, os, dotenv
 from typing import Optional
 from dotenv import load_dotenv

+# Get the current directory of the file being run
 pwd = os.path.dirname(os.path.realpath(__file__))
 print(pwd)

@@ -37,12 +38,13 @@ def test_transcription():


 def test_transcription_azure():
+    litellm.set_verbose = True
     transcript = litellm.transcription(
         model="azure/azure-whisper",
         file=audio_file,
-        api_base=os.getenv("AZURE_EUROPE_API_BASE"),
+        api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
         api_key=os.getenv("AZURE_EUROPE_API_KEY"),
-        api_version=os.getenv("2024-02-15-preview"),
+        api_version="2024-02-15-preview",
     )

     assert transcript.text is not None
@@ -57,9 +59,9 @@ async def test_transcription_async_azure():
     transcript = await litellm.atranscription(
         model="azure/azure-whisper",
         file=audio_file,
-        api_base=os.getenv("AZURE_EUROPE_API_BASE"),
+        api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
         api_key=os.getenv("AZURE_EUROPE_API_KEY"),
-        api_version=os.getenv("2024-02-15-preview"),
+        api_version="2024-02-15-preview",
     )

     assert transcript.text is not None
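Two things change in these test hunks besides the hard-coded endpoint: the Azure test now turns on `litellm.set_verbose`, and `api_version` is passed as a literal string; the previous `os.getenv("2024-02-15-preview")` looked up a non-existent environment variable and returned `None`. A minimal standalone sketch of the same call, with the endpoint env vars and audio file as placeholders:

```python
import os
import litellm

litellm.set_verbose = True

# Placeholder audio clip; any short WAV/MP3 works.
audio_file = open("sample_audio.wav", "rb")

transcript = litellm.transcription(
    model="azure/azure-whisper",                  # azure/<whisper deployment name>
    file=audio_file,
    api_base=os.getenv("AZURE_EUROPE_API_BASE"),  # e.g. https://<resource>.openai.azure.com/
    api_key=os.getenv("AZURE_EUROPE_API_KEY"),
    api_version="2024-02-15-preview",             # literal string, not os.getenv(...)
)
print(transcript.text)
```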
@@ -96,7 +98,7 @@ async def test_transcription_on_router():
             "model_name": "whisper",
             "litellm_params": {
                 "model": "azure/azure-whisper",
-                "api_base": os.getenv("AZURE_EUROPE_API_BASE"),
+                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
                 "api_key": os.getenv("AZURE_EUROPE_API_KEY"),
                 "api_version": "2024-02-15-preview",
             },
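The router test registers the Azure Whisper deployment under the alias `whisper`, which is what lets the router load balance transcription requests across deployments. A hedged sketch of the same setup used directly from Python; `Router.atranscription` is assumed to exist on this branch (it is what the PR wires up), so treat the call as illustrative rather than a guaranteed API:

```python
import asyncio
import os

from litellm import Router

model_list = [
    {
        "model_name": "whisper",  # alias clients call; add more entries to load balance
        "litellm_params": {
            "model": "azure/azure-whisper",
            "api_base": os.getenv("AZURE_EUROPE_API_BASE"),
            "api_key": os.getenv("AZURE_EUROPE_API_KEY"),
            "api_version": "2024-02-15-preview",
        },
    },
]


async def main():
    router = Router(model_list=model_list)
    with open("sample_audio.wav", "rb") as audio_file:  # placeholder clip
        transcript = await router.atranscription(model="whisper", file=audio_file)
    print(transcript.text)


asyncio.run(main())
```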