diff --git a/docs/my-website/docs/load_test.md b/docs/my-website/docs/load_test.md
index f568b56961..94165fb7b2 100644
--- a/docs/my-website/docs/load_test.md
+++ b/docs/my-website/docs/load_test.md
@@ -1,5 +1,84 @@
+import Image from '@theme/IdealImage';
+
# 🔥 Load Test LiteLLM
+## Load Test LiteLLM Proxy - 1500+ req/s
+
+LiteLLM proxy has been load tested to handle 1500+ concurrent requests/second.
+
+```python
+import time, asyncio, uuid
+from openai import AsyncOpenAI
+
+# base_url - litellm proxy endpoint
+# api_key - litellm proxy api-key, created when the proxy is started with auth enabled
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+
+async def litellm_completion():
+    # send a single chat completion request through the litellm proxy
+ try:
+ response = await litellm_client.chat.completions.create(
+ model="azure-gpt-3.5",
+ messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+ )
+ print(response)
+ return response
+
+ except Exception as e:
+ # If there's an exception, log the error message
+ with open("error_log.txt", "a") as error_log:
+ error_log.write(f"Error during completion: {str(e)}\n")
+ pass
+
+
+async def main():
+    for i in range(1):  # increase the range to run multiple rounds
+ start = time.time()
+ n = 1500 # Number of concurrent tasks
+ tasks = [litellm_completion() for _ in range(n)]
+
+ chat_completions = await asyncio.gather(*tasks)
+
+ successful_completions = [c for c in chat_completions if c is not None]
+
+        # errors are already appended to error_log.txt inside litellm_completion
+
+ print(n, time.time() - start, len(successful_completions))
+        await asyncio.sleep(10)  # pause between rounds without blocking the event loop
+
+
+if __name__ == "__main__":
+ # Blank out contents of error_log.txt
+ open("error_log.txt", "w").close()
+
+ asyncio.run(main())
+
+```
+
+### Throughput - 30% Increase
+LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API.
+
+
+### Latency Added - 0.00325 seconds
+LiteLLM proxy adds **0.00325 seconds** of latency compared to using the raw OpenAI API.
+
+
+
+### Testing LiteLLM Proxy with Locust
+- 1 LiteLLM container can handle ~140 requests/second, with ~0.4 failures/second
+
+
+
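+If you want to reproduce the Locust run locally, a minimal `locustfile.py` is sketched below (it mirrors `litellm/proxy/proxy_load_test/locustfile.py` from this PR; the host and the auth header are assumptions you should adjust for your own deployment):
+
+```python
+from locust import HttpUser, task, between
+
+
+class LiteLLMProxyUser(HttpUser):
+    # start with: locust -f locustfile.py --host http://0.0.0.0:4000
+    wait_time = between(1, 5)
+
+    @task
+    def chat_completion(self):
+        headers = {
+            "Content-Type": "application/json",
+            # add "Authorization": "Bearer sk-1234" if your proxy runs with auth
+        }
+        payload = {
+            "model": "gpt-3.5-turbo",
+            "messages": [
+                {"role": "system", "content": "You are a chat bot."},
+                {"role": "user", "content": "Hello, how are you?"},
+            ],
+        }
+        # POST to the proxy's chat completions endpoint; Locust records latency + failures
+        self.client.post("chat/completions", json=payload, headers=headers)
+```
+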
+## Load Test LiteLLM SDK vs OpenAI
Here is a script to load test LiteLLM vs OpenAI
```python
@@ -84,4 +163,5 @@ async def loadtest_fn():
# Run the event loop to execute the async function
asyncio.run(loadtest_fn())
-```
\ No newline at end of file
+```
+
diff --git a/docs/my-website/docs/proxy/configs.md b/docs/my-website/docs/proxy/configs.md
index 5745685de8..a863ec2cac 100644
--- a/docs/my-website/docs/proxy/configs.md
+++ b/docs/my-website/docs/proxy/configs.md
@@ -49,9 +49,9 @@ model_list:
rpm: 6
- model_name: anthropic-claude
litellm_params:
- model="bedrock/anthropic.claude-instant-v1"
+ model: bedrock/anthropic.claude-instant-v1
### [OPTIONAL] SET AWS REGION ###
- aws_region_name="us-east-1"
+ aws_region_name: us-east-1
- model_name: vllm-models
litellm_params:
model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
diff --git a/docs/my-website/docs/proxy/deploy.md b/docs/my-website/docs/proxy/deploy.md
index 6de8625d03..496bde05f1 100644
--- a/docs/my-website/docs/proxy/deploy.md
+++ b/docs/my-website/docs/proxy/deploy.md
@@ -68,6 +68,72 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun
+
+
+To deploy a config-file based LiteLLM instance, you only need a simple Deployment that mounts
+the `config.yaml` via a ConfigMap. It is also good practice to reference API keys as environment
+variables in the config and supply their values through an Opaque Secret.
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: litellm-config-file
+data:
+ config.yaml: |
+ model_list:
+ - model_name: gpt-3.5-turbo
+ litellm_params:
+ model: azure/gpt-turbo-small-ca
+ api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+ api_key: os.environ/CA_AZURE_OPENAI_API_KEY
+---
+apiVersion: v1
+kind: Secret
+type: Opaque
+metadata:
+ name: litellm-secrets
+data:
+ CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: litellm-deployment
+ labels:
+ app: litellm
+spec:
+ selector:
+ matchLabels:
+ app: litellm
+ template:
+ metadata:
+ labels:
+ app: litellm
+ spec:
+ containers:
+ - name: litellm
+ image: ghcr.io/berriai/litellm:main-latest # it is recommended to fix a version generally
+ ports:
+ - containerPort: 4000
+ volumeMounts:
+ - name: config-volume
+ mountPath: /app/proxy_server_config.yaml
+ subPath: config.yaml
+ envFrom:
+ - secretRef:
+ name: litellm-secrets
+ volumes:
+ - name: config-volume
+ configMap:
+ name: litellm-config-file
+```
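+
+The values in the Secret must be base64-encoded. A quick sketch for producing the encoded value (the key below is a placeholder, substitute your real key):
+
+```python
+import base64
+
+# encode the raw API key before pasting it into the Secret manifest
+print(base64.b64encode(b"my-azure-api-key").decode())
+```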
+
+:::tip
+For predictable deployments, easy rollbacks, and consistent environments, pin a version or SHA digest (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`.
+:::
+
+
+
## Deploy with Database
@@ -350,17 +416,3 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
Your LiteLLM container should be running now on the defined port e.g. `8000`.
-
-
-
-## LiteLLM Proxy Performance
-
-LiteLLM proxy has been load tested to handle 1500 req/s.
-
-### Throughput - 30% Increase
-LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
-
-
-### Latency Added - 0.00325 seconds
-LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
-
diff --git a/docs/my-website/docs/proxy/enterprise.md b/docs/my-website/docs/proxy/enterprise.md
index a4f3ea7b17..e0c5374f0e 100644
--- a/docs/my-website/docs/proxy/enterprise.md
+++ b/docs/my-website/docs/proxy/enterprise.md
@@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
:::
Features:
-- [ ] Content Moderation with LlamaGuard
-- [ ] Content Moderation with Google Text Moderations
-- [ ] Content Moderation with LLM Guard
-- [ ] Reject calls from Blocked User list
-- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
-- [ ] Tracking Spend for Custom Tags
+- ✅ Content Moderation with LlamaGuard
+- ✅ Content Moderation with Google Text Moderations
+- ✅ Content Moderation with LLM Guard
+- ✅ Reject calls from Blocked User list
+- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
+- ✅ Don't log/store specific requests (e.g. confidential LLM requests)
+- ✅ Tracking Spend for Custom Tags
-## Content Moderation with LlamaGuard
+## Content Moderation
+### Content Moderation with LlamaGuard
Currently works with Sagemaker's LlamaGuard endpoint.
@@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
```
-### Customize LlamaGuard prompt
+#### Customize LlamaGuard prompt
To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
@@ -51,7 +53,7 @@ callbacks: ["llamaguard_moderations"]
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
```
-## Content Moderation with LLM Guard
+### Content Moderation with LLM Guard
Set the LLM Guard API Base in your environment
@@ -78,7 +80,7 @@ Expected results:
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
```
-## Content Moderation with Google Text Moderation
+### Content Moderation with Google Text Moderation
Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
@@ -89,7 +91,7 @@ litellm_settings:
callbacks: ["google_text_moderation"]
```
-### Set custom confidence thresholds
+#### Set custom confidence thresholds
Google Moderations checks the text against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
@@ -133,6 +135,33 @@ Here are the category specific values:
| "legal" | legal_threshold: 0.1 |
+## Incognito Requests - Don't log anything
+
+When `no-log=True` is set, the request is **not logged to any callbacks** and **no server logs** are written by LiteLLM.
+
+```python
+import openai
+client = openai.OpenAI(
+ api_key="anything", # proxy api-key
+ base_url="http://0.0.0.0:8000" # litellm proxy
+)
+
+response = client.chat.completions.create(
+ model="gpt-3.5-turbo",
+ messages = [
+ {
+ "role": "user",
+ "content": "this is a test request, write a short poem"
+ }
+ ],
+ extra_body={
+ "no-log": True
+ }
+)
+
+print(response)
+```
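+
+The same `no-log` flag is read by the LiteLLM SDK in this release (`litellm.completion` checks `kwargs.get("no-log")`), so a direct SDK call can opt out of logging as well. A minimal sketch, assuming your provider API key is already set in the environment:
+
+```python
+import litellm
+
+# "no-log" contains a hyphen, so it must be passed via **kwargs
+response = litellm.completion(
+    model="gpt-3.5-turbo",
+    messages=[{"role": "user", "content": "this is a test request, write a short poem"}],
+    **{"no-log": True},
+)
+print(response)
+```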
+
## Enable Blocked User Lists
If any call is made to proxy with this user id, it'll be rejected - use this if you want to let users opt-out of ai features
diff --git a/docs/my-website/img/locust.png b/docs/my-website/img/locust.png
new file mode 100644
index 0000000000..1bcedf1d04
Binary files /dev/null and b/docs/my-website/img/locust.png differ
diff --git a/litellm/__init__.py b/litellm/__init__.py
index 506147166e..04c2d23c79 100644
--- a/litellm/__init__.py
+++ b/litellm/__init__.py
@@ -570,7 +570,7 @@ from .utils import (
_calculate_retry_after,
_should_retry,
get_secret,
- get_mapped_model_params,
+ get_supported_openai_params,
)
from .llms.huggingface_restapi import HuggingfaceConfig
from .llms.anthropic import AnthropicConfig
diff --git a/litellm/_logging.py b/litellm/_logging.py
index 438fa9743d..26693c15ec 100644
--- a/litellm/_logging.py
+++ b/litellm/_logging.py
@@ -31,6 +31,18 @@ def _turn_on_debug():
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
+def _disable_debugging():
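+    # silence the litellm, router, and proxy loggers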
+ verbose_logger.disabled = True
+ verbose_router_logger.disabled = True
+ verbose_proxy_logger.disabled = True
+
+
+def _enable_debugging():
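+    # re-enable the litellm, router, and proxy loggers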
+ verbose_logger.disabled = False
+ verbose_router_logger.disabled = False
+ verbose_proxy_logger.disabled = False
+
+
def print_verbose(print_statement):
try:
if set_verbose:
diff --git a/litellm/llms/azure.py b/litellm/llms/azure.py
index 022266996d..5fc0939bbc 100644
--- a/litellm/llms/azure.py
+++ b/litellm/llms/azure.py
@@ -15,6 +15,7 @@ import litellm, json
import httpx
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI
+import uuid
class AzureOpenAIError(Exception):
@@ -271,6 +272,14 @@ class AzureChatCompletion(BaseLLM):
azure_client = AzureOpenAI(**azure_client_params)
else:
azure_client = client
+ if api_version is not None and isinstance(
+ azure_client._custom_query, dict
+ ):
+ # set api_version to version passed by user
+ azure_client._custom_query.setdefault(
+ "api-version", api_version
+ )
+
response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore
stringified_response = response.model_dump()
## LOGGING
@@ -334,10 +343,17 @@ class AzureChatCompletion(BaseLLM):
azure_client_params["api_key"] = api_key
elif azure_ad_token is not None:
azure_client_params["azure_ad_token"] = azure_ad_token
+
+ # setting Azure client
if client is None:
azure_client = AsyncAzureOpenAI(**azure_client_params)
else:
azure_client = client
+ if api_version is not None and isinstance(
+ azure_client._custom_query, dict
+ ):
+ # set api_version to version passed by user
+ azure_client._custom_query.setdefault("api-version", api_version)
## LOGGING
logging_obj.pre_call(
input=data["messages"],
@@ -402,6 +418,9 @@ class AzureChatCompletion(BaseLLM):
azure_client = AzureOpenAI(**azure_client_params)
else:
azure_client = client
+ if api_version is not None and isinstance(azure_client._custom_query, dict):
+ # set api_version to version passed by user
+ azure_client._custom_query.setdefault("api-version", api_version)
## LOGGING
logging_obj.pre_call(
input=data["messages"],
@@ -455,6 +474,11 @@ class AzureChatCompletion(BaseLLM):
azure_client = AsyncAzureOpenAI(**azure_client_params)
else:
azure_client = client
+ if api_version is not None and isinstance(
+ azure_client._custom_query, dict
+ ):
+ # set api_version to version passed by user
+ azure_client._custom_query.setdefault("api-version", api_version)
## LOGGING
logging_obj.pre_call(
input=data["messages"],
@@ -813,6 +837,19 @@ class AzureChatCompletion(BaseLLM):
azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params) # type: ignore
else:
azure_client = client
+
+ ## LOGGING
+ logging_obj.pre_call(
+ input=f"audio_file_{uuid.uuid4()}",
+ api_key=azure_client.api_key,
+ additional_args={
+ "headers": {"Authorization": f"Bearer {azure_client.api_key}"},
+ "api_base": azure_client._base_url._uri_reference,
+ "atranscription": True,
+ "complete_input_dict": data,
+ },
+ )
+
response = azure_client.audio.transcriptions.create(
**data, timeout=timeout # type: ignore
)
@@ -850,6 +887,20 @@ class AzureChatCompletion(BaseLLM):
else:
async_azure_client = client
+ ## LOGGING
+ logging_obj.pre_call(
+ input=f"audio_file_{uuid.uuid4()}",
+ api_key=async_azure_client.api_key,
+ additional_args={
+ "headers": {
+ "Authorization": f"Bearer {async_azure_client.api_key}"
+ },
+ "api_base": async_azure_client._base_url._uri_reference,
+ "atranscription": True,
+ "complete_input_dict": data,
+ },
+ )
+
response = await async_azure_client.audio.transcriptions.create(
**data, timeout=timeout
) # type: ignore
diff --git a/litellm/llms/openai.py b/litellm/llms/openai.py
index a90d2457a1..64c0aa3afd 100644
--- a/litellm/llms/openai.py
+++ b/litellm/llms/openai.py
@@ -238,14 +238,22 @@ class OpenAIChatCompletion(BaseLLM):
status_code=422, message=f"Timeout needs to be a float"
)
- if custom_llm_provider == "mistral":
- # check if message content passed in as list, and not string
- messages = prompt_factory(
- model=model,
- messages=messages,
- custom_llm_provider=custom_llm_provider,
- )
-
+ if custom_llm_provider != "openai":
+ # process all OpenAI compatible provider logic here
+ if custom_llm_provider == "mistral":
+ # check if message content passed in as list, and not string
+ messages = prompt_factory(
+ model=model,
+ messages=messages,
+ custom_llm_provider=custom_llm_provider,
+ )
+ if custom_llm_provider == "perplexity" and messages is not None:
+                # perplexity does not support message.name - strip it in prompt_factory
+ messages = prompt_factory(
+ model=model,
+ messages=messages,
+ custom_llm_provider=custom_llm_provider,
+ )
for _ in range(
2
): # if call fails due to alternating messages, retry with reformatted message
diff --git a/litellm/llms/prompt_templates/factory.py b/litellm/llms/prompt_templates/factory.py
index 616833a2ec..a13130c62a 100644
--- a/litellm/llms/prompt_templates/factory.py
+++ b/litellm/llms/prompt_templates/factory.py
@@ -556,6 +556,7 @@ def anthropic_messages_pt(messages: list):
3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm)
4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
5. System messages are a separate param to the Messages API (used for tool calling)
+ 6. Ensure we only accept role, content. (message.name is not supported)
"""
## Ensure final assistant message has no trailing whitespace
last_assistant_message_idx: Optional[int] = None
@@ -583,7 +584,9 @@ def anthropic_messages_pt(messages: list):
new_content.append({"type": "text", "text": m["text"]})
new_messages.append({"role": messages[0]["role"], "content": new_content}) # type: ignore
else:
- new_messages.append(messages[0])
+ new_messages.append(
+ {"role": messages[0]["role"], "content": messages[0]["content"]}
+ )
return new_messages
@@ -606,7 +609,9 @@ def anthropic_messages_pt(messages: list):
new_content.append({"type": "text", "content": m["text"]})
new_messages.append({"role": messages[i]["role"], "content": new_content}) # type: ignore
else:
- new_messages.append(messages[i])
+ new_messages.append(
+ {"role": messages[i]["role"], "content": messages[i]["content"]}
+ )
if messages[i]["role"] == messages[i + 1]["role"]:
if messages[i]["role"] == "user":
@@ -897,6 +902,10 @@ def prompt_factory(
return anthropic_pt(messages=messages)
elif "mistral." in model:
return mistral_instruct_pt(messages=messages)
+ elif custom_llm_provider == "perplexity":
+ for message in messages:
+ message.pop("name", None)
+ return messages
try:
if "meta-llama/llama-2" in model and "chat" in model:
return llama_2_chat_pt(messages=messages)
diff --git a/litellm/main.py b/litellm/main.py
index 6deaf653f6..114b469488 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -488,6 +488,8 @@ def completion(
### ASYNC CALLS ###
acompletion = kwargs.get("acompletion", False)
client = kwargs.get("client", None)
+ ### Admin Controls ###
+ no_log = kwargs.get("no-log", False)
######## end of unpacking kwargs ###########
openai_params = [
"functions",
@@ -564,6 +566,7 @@ def completion(
"caching_groups",
"ttl",
"cache",
+ "no-log",
]
default_params = openai_params + litellm_params
non_default_params = {
@@ -727,6 +730,7 @@ def completion(
model_info=model_info,
proxy_server_request=proxy_server_request,
preset_cache_key=preset_cache_key,
+ no_log=no_log,
)
logging.update_environment_variables(
model=model,
@@ -2418,6 +2422,7 @@ def embedding(
"caching_groups",
"ttl",
"cache",
+ "no-log",
]
default_params = openai_params + litellm_params
non_default_params = {
diff --git a/litellm/proxy/proxy_cli.py b/litellm/proxy/proxy_cli.py
index f7eba02ecb..367bbbb700 100644
--- a/litellm/proxy/proxy_cli.py
+++ b/litellm/proxy/proxy_cli.py
@@ -16,6 +16,13 @@ from importlib import resources
import shutil
telemetry = None
+default_num_workers = 1
+try:
+    # default to (CPU count - 1) workers, keeping one core free; never drop below 1
+    cpu_count = os.cpu_count() or 1
+    if cpu_count > 1:
+        default_num_workers = cpu_count - 1
+except Exception:
+    pass
def append_query_params(url, params):
@@ -57,7 +64,7 @@ def is_port_in_use(port):
@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
@click.option(
"--num_workers",
- default=1,
+ default=default_num_workers,
help="Number of gunicorn workers to spin up",
envvar="NUM_WORKERS",
)
diff --git a/litellm/proxy/proxy_config.yaml b/litellm/proxy/proxy_config.yaml
index 654a50b2f4..76c9ed04cd 100644
--- a/litellm/proxy/proxy_config.yaml
+++ b/litellm/proxy/proxy_config.yaml
@@ -5,12 +5,9 @@ model_list:
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
- - model_name: azure-gpt-3.5
- litellm_params:
- model: gpt-3.5-turbo
- api_key: os.environ/OPENAI_API_KEY
- model_info:
- access_groups: ["public"]
+litellm_settings:
+ set_verbose: True
+ success_callback: ["langfuse"]
router_settings:
set_verbose: True
debug_level: "DEBUG"
\ No newline at end of file
diff --git a/litellm/proxy/proxy_load_test/litellm_proxy_config.yaml b/litellm/proxy/proxy_load_test/litellm_proxy_config.yaml
new file mode 100644
index 0000000000..2e107d3668
--- /dev/null
+++ b/litellm/proxy/proxy_load_test/litellm_proxy_config.yaml
@@ -0,0 +1,6 @@
+model_list:
+ - model_name: gpt-3.5-turbo
+ litellm_params:
+ model: openai/my-fake-model
+ api_key: my-fake-key
+ api_base: http://0.0.0.0:8090
\ No newline at end of file
diff --git a/litellm/proxy/proxy_load_test/locustfile.py b/litellm/proxy/proxy_load_test/locustfile.py
new file mode 100644
index 0000000000..2cd2e2fcce
--- /dev/null
+++ b/litellm/proxy/proxy_load_test/locustfile.py
@@ -0,0 +1,27 @@
+from locust import HttpUser, task, between
+
+
+class MyUser(HttpUser):
+ wait_time = between(1, 5)
+
+ @task
+ def chat_completion(self):
+ headers = {
+ "Content-Type": "application/json",
+ # Include any additional headers you may need for authentication, etc.
+ }
+
+ # Customize the payload with "model" and "messages" keys
+ payload = {
+ "model": "gpt-3.5-turbo",
+ "messages": [
+ {"role": "system", "content": "You are a chat bot."},
+ {"role": "user", "content": "Hello, how are you?"},
+ ],
+ # Add more data as necessary
+ }
+
+ # Make a POST request to the "chat/completions" endpoint
+ response = self.client.post("chat/completions", json=payload, headers=headers)
+
+ # Print or log the response if needed
diff --git a/litellm/proxy/proxy_load_test/openai_endpoint.py b/litellm/proxy/proxy_load_test/openai_endpoint.py
new file mode 100644
index 0000000000..b3291ce709
--- /dev/null
+++ b/litellm/proxy/proxy_load_test/openai_endpoint.py
@@ -0,0 +1,50 @@
+# Mock OpenAI-compatible endpoint that returns a canned /chat/completions response.
+# Used as a fake backend when load testing the litellm proxy.
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI()
+
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+async def completion(request: Request):
+ return {
+ "id": "chatcmpl-123",
+ "object": "chat.completion",
+ "created": 1677652288,
+ "model": "gpt-3.5-turbo-0125",
+ "system_fingerprint": "fp_44709d6fcb",
+ "choices": [
+ {
+ "index": 0,
+ "message": {
+ "role": "assistant",
+ "content": "\n\nHello there, how may I assist you today?",
+ },
+ "logprobs": None,
+ "finish_reason": "stop",
+ }
+ ],
+ "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+ }
+
+
+if __name__ == "__main__":
+ import uvicorn
+
+ # run this on 8090, 8091, 8092 and 8093
+ uvicorn.run(app, host="0.0.0.0", port=8090)
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
index 7a04c8e7d2..8b1db959c4 100644
--- a/litellm/proxy/proxy_server.py
+++ b/litellm/proxy/proxy_server.py
@@ -1677,9 +1677,9 @@ class ProxyConfig:
# these are litellm callbacks - "langfuse", "sentry", "wandb"
else:
litellm.success_callback.append(callback)
- verbose_proxy_logger.debug(
+ print( # noqa
f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
- )
+ ) # noqa
elif key == "failure_callback":
litellm.failure_callback = []
@@ -2672,6 +2672,11 @@ async def chat_completion(
except:
data = json.loads(body_str)
+ # Azure OpenAI only: check if user passed api-version
+ query_params = dict(request.query_params)
+ if "api-version" in query_params:
+ data["api_version"] = query_params["api-version"]
+
# Include original request and headers in the data
data["proxy_server_request"] = {
"url": str(request.url),
diff --git a/litellm/tests/test_completion.py b/litellm/tests/test_completion.py
index 4db664dde3..e54617bd95 100644
--- a/litellm/tests/test_completion.py
+++ b/litellm/tests/test_completion.py
@@ -83,12 +83,13 @@ def test_completion_claude():
def test_completion_claude_3_empty_response():
+ litellm.set_verbose = True
messages = [
{
"role": "system",
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
},
- {"role": "user", "content": "Hi gm!"},
+ {"role": "user", "content": "Hi gm!", "name": "ishaan"},
{"role": "assistant", "content": "Good morning! How are you doing today?"},
{
"role": "user",
diff --git a/litellm/tests/test_streaming.py b/litellm/tests/test_streaming.py
index c513447b02..2896b4a711 100644
--- a/litellm/tests/test_streaming.py
+++ b/litellm/tests/test_streaming.py
@@ -511,7 +511,7 @@ def test_completion_mistral_api_stream():
def test_completion_deep_infra_stream():
- # deep infra currently includes role in the 2nd chunk
+    # Deep Infra currently includes role in the 2nd chunk
# waiting for them to make a fix on this
litellm.set_verbose = True
try:
diff --git a/litellm/utils.py b/litellm/utils.py
index 3285f3a08a..7466bd5c69 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -981,6 +981,7 @@ class Logging:
curl_command = self.model_call_details
# only print verbose if verbose logger is not set
+
if verbose_logger.level == 0:
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
print_verbose(f"\033[92m{curl_command}\033[0m\n")
@@ -1312,6 +1313,15 @@ class Logging:
for callback in callbacks:
try:
+ litellm_params = self.model_call_details.get("litellm_params", {})
+ if litellm_params.get("no-log", False) == True:
+                    # proxy cost tracking callbacks should still run
+ if not (
+ isinstance(callback, CustomLogger)
+ and "_PROXY_" in callback.__class__.__name__
+ ):
+ print_verbose("no-log request, skipping logging")
+ continue
if callback == "lite_debugger":
print_verbose("reaches lite_debugger for logging!")
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
@@ -1740,7 +1750,20 @@ class Logging:
callbacks = litellm._async_success_callback
verbose_logger.debug(f"Async success callbacks: {callbacks}")
for callback in callbacks:
+ # check if callback can run for this request
+ litellm_params = self.model_call_details.get("litellm_params", {})
+ if litellm_params.get("no-log", False) == True:
+ # proxy cost tracking cal backs should run
+                # proxy cost tracking callbacks should still run
+ isinstance(callback, CustomLogger)
+ and "_PROXY_" in callback.__class__.__name__
+ ):
+ print_verbose("no-log request, skipping logging")
+ continue
try:
+ if kwargs.get("no-log", False) == True:
+ print_verbose("no-log request, skipping logging")
+ continue
if callback == "cache" and litellm.cache is not None:
# set_cache once complete streaming response is built
print_verbose("async success_callback: reaches cache for logging!")
@@ -3026,11 +3049,13 @@ def client(original_function):
print_verbose(
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
)
+ # check if user does not want this to be logged
asyncio.create_task(
logging_obj.async_success_handler(result, start_time, end_time)
)
threading.Thread(
- target=logging_obj.success_handler, args=(result, start_time, end_time)
+ target=logging_obj.success_handler,
+ args=(result, start_time, end_time),
).start()
# RETURN RESULT
@@ -3933,6 +3958,7 @@ def get_litellm_params(
proxy_server_request=None,
acompletion=None,
preset_cache_key=None,
+ no_log=None,
):
litellm_params = {
"acompletion": acompletion,
@@ -3949,6 +3975,7 @@ def get_litellm_params(
"model_info": model_info,
"proxy_server_request": proxy_server_request,
"preset_cache_key": preset_cache_key,
+ "no-log": no_log,
"stream_response": {}, # litellm_call_id: ModelResponse Dict
}
@@ -4269,15 +4296,9 @@ def get_optional_params(
## raise exception if provider doesn't support passed in param
if custom_llm_provider == "anthropic":
## check if unsupported param passed in
- supported_params = [
- "stream",
- "stop",
- "temperature",
- "top_p",
- "max_tokens",
- "tools",
- "tool_choice",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
# handle anthropic params
if stream:
@@ -4301,17 +4322,9 @@ def get_optional_params(
optional_params["tools"] = tools
elif custom_llm_provider == "cohere":
## check if unsupported param passed in
- supported_params = [
- "stream",
- "temperature",
- "max_tokens",
- "logit_bias",
- "top_p",
- "frequency_penalty",
- "presence_penalty",
- "stop",
- "n",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
# handle cohere params
if stream:
@@ -4334,14 +4347,9 @@ def get_optional_params(
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "maritalk":
## check if unsupported param passed in
- supported_params = [
- "stream",
- "temperature",
- "max_tokens",
- "top_p",
- "presence_penalty",
- "stop",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
# handle cohere params
if stream:
@@ -4360,14 +4368,9 @@ def get_optional_params(
optional_params["stopping_tokens"] = stop
elif custom_llm_provider == "replicate":
## check if unsupported param passed in
- supported_params = [
- "stream",
- "temperature",
- "max_tokens",
- "top_p",
- "stop",
- "seed",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if stream:
@@ -4388,7 +4391,9 @@ def get_optional_params(
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "huggingface":
## check if unsupported param passed in
- supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
if temperature is not None:
@@ -4427,16 +4432,9 @@ def get_optional_params(
) # since we handle translating echo, we should not send it to TGI request
elif custom_llm_provider == "together_ai":
## check if unsupported param passed in
- supported_params = [
- "stream",
- "temperature",
- "max_tokens",
- "top_p",
- "stop",
- "frequency_penalty",
- "tools",
- "tool_choice",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if stream:
@@ -4457,16 +4455,9 @@ def get_optional_params(
optional_params["tool_choice"] = tool_choice
elif custom_llm_provider == "ai21":
## check if unsupported param passed in
- supported_params = [
- "stream",
- "n",
- "temperature",
- "max_tokens",
- "top_p",
- "stop",
- "frequency_penalty",
- "presence_penalty",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if stream:
@@ -4489,7 +4480,9 @@ def get_optional_params(
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
): # https://developers.generativeai.google/tutorials/curl_quickstart
## check if unsupported param passed in
- supported_params = ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
@@ -4518,14 +4511,9 @@ def get_optional_params(
):
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
## check if unsupported param passed in
- supported_params = [
- "temperature",
- "top_p",
- "max_tokens",
- "stream",
- "tools",
- "tool_choice",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
@@ -4555,7 +4543,9 @@ def get_optional_params(
)
elif custom_llm_provider == "sagemaker":
## check if unsupported param passed in
- supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
if temperature is not None:
@@ -4582,8 +4572,10 @@ def get_optional_params(
max_tokens = 1
optional_params["max_new_tokens"] = max_tokens
elif custom_llm_provider == "bedrock":
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
if "ai21" in model:
- supported_params = ["max_tokens", "temperature", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
@@ -4596,9 +4588,6 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif "anthropic" in model:
- supported_params = get_mapped_model_params(
- model=model, custom_llm_provider=custom_llm_provider
- )
_check_valid_arg(supported_params=supported_params)
# anthropic params on bedrock
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
@@ -4615,7 +4604,6 @@ def get_optional_params(
optional_params=optional_params,
)
elif "amazon" in model: # amazon titan llms
- supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
if max_tokens is not None:
@@ -4632,7 +4620,6 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif "meta" in model: # amazon / meta llms
- supported_params = ["max_tokens", "temperature", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
if max_tokens is not None:
@@ -4644,7 +4631,6 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif "cohere" in model: # cohere models on bedrock
- supported_params = ["stream", "temperature", "max_tokens"]
_check_valid_arg(supported_params=supported_params)
# handle cohere params
if stream:
@@ -4654,7 +4640,6 @@ def get_optional_params(
if max_tokens is not None:
optional_params["max_tokens"] = max_tokens
elif "mistral" in model:
- supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
_check_valid_arg(supported_params=supported_params)
# mistral params on bedrock
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
@@ -4698,7 +4683,9 @@ def get_optional_params(
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "cloudflare":
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
- supported_params = ["max_tokens", "stream"]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
@@ -4706,14 +4693,9 @@ def get_optional_params(
if stream is not None:
optional_params["stream"] = stream
elif custom_llm_provider == "ollama":
- supported_params = [
- "max_tokens",
- "stream",
- "top_p",
- "temperature",
- "frequency_penalty",
- "stop",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
@@ -4737,16 +4719,9 @@ def get_optional_params(
non_default_params=non_default_params, optional_params=optional_params
)
elif custom_llm_provider == "nlp_cloud":
- supported_params = [
- "max_tokens",
- "stream",
- "temperature",
- "top_p",
- "presence_penalty",
- "frequency_penalty",
- "n",
- "stop",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if max_tokens is not None:
@@ -4766,7 +4741,9 @@ def get_optional_params(
if stop is not None:
optional_params["stop_sequences"] = stop
elif custom_llm_provider == "petals":
- supported_params = ["max_tokens", "temperature", "top_p", "stream"]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
# max_new_tokens=1,temperature=0.9, top_p=0.6
if max_tokens is not None:
@@ -4778,18 +4755,9 @@ def get_optional_params(
if stream:
optional_params["stream"] = stream
elif custom_llm_provider == "deepinfra":
- supported_params = [
- "temperature",
- "top_p",
- "n",
- "stream",
- "stop",
- "max_tokens",
- "presence_penalty",
- "frequency_penalty",
- "logit_bias",
- "user",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
if (
@@ -4816,14 +4784,9 @@ def get_optional_params(
if user:
optional_params["user"] = user
elif custom_llm_provider == "perplexity":
- supported_params = [
- "temperature",
- "top_p",
- "stream",
- "max_tokens",
- "presence_penalty",
- "frequency_penalty",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
if (
@@ -4842,15 +4805,9 @@ def get_optional_params(
if frequency_penalty:
optional_params["frequency_penalty"] = frequency_penalty
elif custom_llm_provider == "anyscale":
- supported_params = [
- "temperature",
- "top_p",
- "stream",
- "max_tokens",
- "stop",
- "frequency_penalty",
- "presence_penalty",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
if model in [
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -4878,14 +4835,9 @@ def get_optional_params(
if max_tokens:
optional_params["max_tokens"] = max_tokens
elif custom_llm_provider == "mistral":
- supported_params = [
- "temperature",
- "top_p",
- "stream",
- "max_tokens",
- "tools",
- "tool_choice",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if temperature is not None:
optional_params["temperature"] = temperature
@@ -4912,25 +4864,9 @@ def get_optional_params(
extra_body # openai client supports `extra_body` param
)
elif custom_llm_provider == "openrouter":
- supported_params = [
- "functions",
- "function_call",
- "temperature",
- "top_p",
- "n",
- "stream",
- "stop",
- "max_tokens",
- "presence_penalty",
- "frequency_penalty",
- "logit_bias",
- "user",
- "response_format",
- "seed",
- "tools",
- "tool_choice",
- "max_retries",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider=custom_llm_provider
+ )
_check_valid_arg(supported_params=supported_params)
if functions is not None:
@@ -4984,28 +4920,9 @@ def get_optional_params(
)
else: # assume passing in params for openai/azure openai
print_verbose(f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE")
- supported_params = [
- "functions",
- "function_call",
- "temperature",
- "top_p",
- "n",
- "stream",
- "stop",
- "max_tokens",
- "presence_penalty",
- "frequency_penalty",
- "logit_bias",
- "user",
- "response_format",
- "seed",
- "tools",
- "tool_choice",
- "max_retries",
- "logprobs",
- "top_logprobs",
- "extra_headers",
- ]
+ supported_params = get_supported_openai_params(
+ model=model, custom_llm_provider="openai"
+ )
_check_valid_arg(supported_params=supported_params)
if functions is not None:
optional_params["functions"] = functions
@@ -5063,15 +4980,228 @@ def get_optional_params(
return optional_params
-def get_mapped_model_params(model: str, custom_llm_provider: str):
+def get_supported_openai_params(model: str, custom_llm_provider: str):
"""
Returns the supported openai params for a given model + provider
+
+ Example:
+ ```
+ get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
+ ```
"""
if custom_llm_provider == "bedrock":
if model.startswith("anthropic.claude-3"):
return litellm.AmazonAnthropicClaude3Config().get_supported_openai_params()
- else:
+ elif model.startswith("anthropic"):
return litellm.AmazonAnthropicConfig().get_supported_openai_params()
+ elif model.startswith("ai21"):
+ return ["max_tokens", "temperature", "top_p", "stream"]
+ elif model.startswith("amazon"):
+ return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+ elif model.startswith("meta"):
+ return ["max_tokens", "temperature", "top_p", "stream"]
+ elif model.startswith("cohere"):
+ return ["stream", "temperature", "max_tokens"]
+ elif model.startswith("mistral"):
+ return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+ elif custom_llm_provider == "ollama_chat":
+ return litellm.OllamaChatConfig().get_supported_openai_params()
+ elif custom_llm_provider == "anthropic":
+ return [
+ "stream",
+ "stop",
+ "temperature",
+ "top_p",
+ "max_tokens",
+ "tools",
+ "tool_choice",
+ ]
+ elif custom_llm_provider == "cohere":
+ return [
+ "stream",
+ "temperature",
+ "max_tokens",
+ "logit_bias",
+ "top_p",
+ "frequency_penalty",
+ "presence_penalty",
+ "stop",
+ "n",
+ ]
+ elif custom_llm_provider == "maritalk":
+ return [
+ "stream",
+ "temperature",
+ "max_tokens",
+ "top_p",
+ "presence_penalty",
+ "stop",
+ ]
+ elif custom_llm_provider == "openai" or custom_llm_provider == "azure":
+ return [
+ "functions",
+ "function_call",
+ "temperature",
+ "top_p",
+ "n",
+ "stream",
+ "stop",
+ "max_tokens",
+ "presence_penalty",
+ "frequency_penalty",
+ "logit_bias",
+ "user",
+ "response_format",
+ "seed",
+ "tools",
+ "tool_choice",
+ "max_retries",
+ "logprobs",
+ "top_logprobs",
+ "extra_headers",
+ ]
+ elif custom_llm_provider == "openrouter":
+ return [
+ "functions",
+ "function_call",
+ "temperature",
+ "top_p",
+ "n",
+ "stream",
+ "stop",
+ "max_tokens",
+ "presence_penalty",
+ "frequency_penalty",
+ "logit_bias",
+ "user",
+ "response_format",
+ "seed",
+ "tools",
+ "tool_choice",
+ "max_retries",
+ ]
+ elif custom_llm_provider == "mistral":
+ return [
+ "temperature",
+ "top_p",
+ "stream",
+ "max_tokens",
+ "tools",
+ "tool_choice",
+ ]
+ elif custom_llm_provider == "replicate":
+ return [
+ "stream",
+ "temperature",
+ "max_tokens",
+ "top_p",
+ "stop",
+ "seed",
+ ]
+ elif custom_llm_provider == "huggingface":
+ return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+ elif custom_llm_provider == "together_ai":
+ return [
+ "stream",
+ "temperature",
+ "max_tokens",
+ "top_p",
+ "stop",
+ "frequency_penalty",
+ "tools",
+ "tool_choice",
+ ]
+ elif custom_llm_provider == "ai21":
+ return [
+ "stream",
+ "n",
+ "temperature",
+ "max_tokens",
+ "top_p",
+ "stop",
+ "frequency_penalty",
+ "presence_penalty",
+ ]
+ elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
+ return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
+ elif custom_llm_provider == "vertex_ai":
+ return [
+ "temperature",
+ "top_p",
+ "max_tokens",
+ "stream",
+ "tools",
+ "tool_choice",
+ ]
+ elif custom_llm_provider == "sagemaker":
+ return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+ elif custom_llm_provider == "aleph_alpha":
+ return [
+ "max_tokens",
+ "stream",
+ "top_p",
+ "temperature",
+ "presence_penalty",
+ "frequency_penalty",
+ "n",
+ "stop",
+ ]
+ elif custom_llm_provider == "cloudflare":
+ return ["max_tokens", "stream"]
+ elif custom_llm_provider == "ollama":
+ return [
+ "max_tokens",
+ "stream",
+ "top_p",
+ "temperature",
+ "frequency_penalty",
+ "stop",
+ ]
+ elif custom_llm_provider == "nlp_cloud":
+ return [
+ "max_tokens",
+ "stream",
+ "temperature",
+ "top_p",
+ "presence_penalty",
+ "frequency_penalty",
+ "n",
+ "stop",
+ ]
+ elif custom_llm_provider == "petals":
+ return ["max_tokens", "temperature", "top_p", "stream"]
+ elif custom_llm_provider == "deepinfra":
+ return [
+ "temperature",
+ "top_p",
+ "n",
+ "stream",
+ "stop",
+ "max_tokens",
+ "presence_penalty",
+ "frequency_penalty",
+ "logit_bias",
+ "user",
+ ]
+ elif custom_llm_provider == "perplexity":
+ return [
+ "temperature",
+ "top_p",
+ "stream",
+ "max_tokens",
+ "presence_penalty",
+ "frequency_penalty",
+ ]
+ elif custom_llm_provider == "anyscale":
+ return [
+ "temperature",
+ "top_p",
+ "stream",
+ "max_tokens",
+ "stop",
+ "frequency_penalty",
+ "presence_penalty",
+ ]
def get_llm_provider(
diff --git a/pyproject.toml b/pyproject.toml
index 80f5c822c1..4c00be9102 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
-version = "1.30.3"
+version = "1.30.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
-version = "1.30.3"
+version = "1.30.4"
version_files = [
"pyproject.toml:^version"
]
diff --git a/tests/test_whisper.py b/tests/test_whisper.py
index 5cb6519518..54ecfbf50c 100644
--- a/tests/test_whisper.py
+++ b/tests/test_whisper.py
@@ -9,6 +9,7 @@ import sys, os, dotenv
from typing import Optional
from dotenv import load_dotenv
+# Get the current directory of the file being run
pwd = os.path.dirname(os.path.realpath(__file__))
print(pwd)
@@ -37,12 +38,13 @@ def test_transcription():
def test_transcription_azure():
+ litellm.set_verbose = True
transcript = litellm.transcription(
model="azure/azure-whisper",
file=audio_file,
- api_base=os.getenv("AZURE_EUROPE_API_BASE"),
+ api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
api_key=os.getenv("AZURE_EUROPE_API_KEY"),
- api_version=os.getenv("2024-02-15-preview"),
+ api_version="2024-02-15-preview",
)
assert transcript.text is not None
@@ -57,9 +59,9 @@ async def test_transcription_async_azure():
transcript = await litellm.atranscription(
model="azure/azure-whisper",
file=audio_file,
- api_base=os.getenv("AZURE_EUROPE_API_BASE"),
+ api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
api_key=os.getenv("AZURE_EUROPE_API_KEY"),
- api_version=os.getenv("2024-02-15-preview"),
+ api_version="2024-02-15-preview",
)
assert transcript.text is not None
@@ -96,7 +98,7 @@ async def test_transcription_on_router():
"model_name": "whisper",
"litellm_params": {
"model": "azure/azure-whisper",
- "api_base": os.getenv("AZURE_EUROPE_API_BASE"),
+ "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
"api_key": os.getenv("AZURE_EUROPE_API_KEY"),
"api_version": "2024-02-15-preview",
},