forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_load_balancing_transcription_endpoints
Commit caa99f43bf
22 changed files with 704 additions and 233 deletions
@@ -1,5 +1,84 @@
import Image from '@theme/IdealImage';

# 🔥 Load Test LiteLLM

## Load Test LiteLLM Proxy - 1500+ req/s

## 1500+ concurrent requests/s

LiteLLM proxy has been load tested to handle 1500+ concurrent req/s

```python
import time, asyncio
from openai import AsyncOpenAI, AsyncAzureOpenAI
import uuid
import traceback

# base_url - litellm proxy endpoint
# api_key - litellm proxy api-key (created when the proxy is run with auth)
litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")


async def litellm_completion():
    # Your existing code for litellm_completion goes here
    try:
        response = await litellm_client.chat.completions.create(
            model="azure-gpt-3.5",
            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
        )
        print(response)
        return response

    except Exception as e:
        # If there's an exception, log the error message
        with open("error_log.txt", "a") as error_log:
            error_log.write(f"Error during completion: {str(e)}\n")
        pass


async def main():
    for i in range(1):
        start = time.time()
        n = 1500  # Number of concurrent tasks
        tasks = [litellm_completion() for _ in range(n)]

        chat_completions = await asyncio.gather(*tasks)

        successful_completions = [c for c in chat_completions if c is not None]

        # Write errors to error_log.txt
        with open("error_log.txt", "a") as error_log:
            for completion in chat_completions:
                if isinstance(completion, str):
                    error_log.write(completion + "\n")

        print(n, time.time() - start, len(successful_completions))
        time.sleep(10)


if __name__ == "__main__":
    # Blank out contents of error_log.txt
    open("error_log.txt", "w").close()

    asyncio.run(main())

```

### Throughput - 30% Increase

LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API

<Image img={require('../img/throughput.png')} />

### Latency Added - 0.00325 seconds

LiteLLM proxy adds **0.00325 seconds** of latency compared to the raw OpenAI API

<Image img={require('../img/latency.png')} />


### Testing LiteLLM Proxy with Locust
- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures (see the minimal locustfile sketch below)

<Image img={require('../img/locust.png')} />

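The Locust numbers above can be reproduced with a small locustfile pointed at the proxy. The sketch below mirrors the `litellm/proxy/proxy_load_test/locustfile.py` file added in this commit; the proxy URL and the absence of an auth header are assumptions, so adjust the host and add an `Authorization` header if your proxy enforces keys.

```python
from locust import HttpUser, task, between


class LiteLLMProxyUser(HttpUser):
    # Assumed proxy address - override with `locust --host http://<your-proxy>:4000`
    host = "http://0.0.0.0:4000"
    wait_time = between(1, 5)

    @task
    def chat_completion(self):
        payload = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": "Hello, how are you?"}],
        }
        # Add an Authorization header here if your proxy requires a virtual key
        self.client.post(
            "/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
        )
```
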
## Load Test LiteLLM SDK vs OpenAI
Here is a script to load test LiteLLM vs OpenAI

```python

@@ -84,4 +163,5 @@ async def loadtest_fn():
# Run the event loop to execute the async function
asyncio.run(loadtest_fn())

```
```

@@ -49,9 +49,9 @@ model_list:
      rpm: 6
  - model_name: anthropic-claude
    litellm_params:
      model="bedrock/anthropic.claude-instant-v1"
      model: bedrock/anthropic.claude-instant-v1
      ### [OPTIONAL] SET AWS REGION ###
      aws_region_name="us-east-1"
      aws_region_name: us-east-1
  - model_name: vllm-models
    litellm_params:
      model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible

@@ -68,6 +68,72 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun

</TabItem>

<TabItem value="kubernetes" label="Kubernetes">

Deploying a config-file-based litellm instance just requires a simple Deployment that loads
the config.yaml via a ConfigMap. It is also good practice to declare API keys as env vars
and attach the env vars with the API key values as an opaque Secret.

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: litellm-config-file
data:
  config.yaml: |
    model_list:
      - model_name: gpt-3.5-turbo
        litellm_params:
          model: azure/gpt-turbo-small-ca
          api_base: https://my-endpoint-canada-berri992.openai.azure.com/
          api_key: os.environ/CA_AZURE_OPENAI_API_KEY
---
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: litellm-secrets
data:
  CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-deployment
  labels:
    app: litellm
spec:
  selector:
    matchLabels:
      app: litellm
  template:
    metadata:
      labels:
        app: litellm
    spec:
      containers:
        - name: litellm
          image: ghcr.io/berriai/litellm:main-latest # it is generally recommended to pin a version
          ports:
            - containerPort: 4000
          volumeMounts:
            - name: config-volume
              mountPath: /app/proxy_server_config.yaml
              subPath: config.yaml
          envFrom:
            - secretRef:
                name: litellm-secrets
      volumes:
        - name: config-volume
          configMap:
            name: litellm-config-file
```

> [!TIP]
> To avoid issues with predictability, difficulties in rollback, and inconsistent environments, use versioning or SHA digests (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`.

</TabItem>

</Tabs>

## Deploy with Database

@@ -350,17 +416,3 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in

Your LiteLLM container should now be running on the defined port, e.g. `8000`.


## LiteLLM Proxy Performance

LiteLLM proxy has been load tested to handle 1500 req/s.

### Throughput - 30% Increase

LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API

<Image img={require('../../img/throughput.png')} />

### Latency Added - 0.00325 seconds

LiteLLM proxy adds **0.00325 seconds** of latency compared to the raw OpenAI API

<Image img={require('../../img/latency.png')} />

@@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
:::

Features:
- [ ] Content Moderation with LlamaGuard
- [ ] Content Moderation with Google Text Moderations
- [ ] Content Moderation with LLM Guard
- [ ] Reject calls from Blocked User list
- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- [ ] Tracking Spend for Custom Tags
- ✅ Content Moderation with LlamaGuard
- ✅ Content Moderation with Google Text Moderations
- ✅ Content Moderation with LLM Guard
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- ✅ Don't log/store specific requests (e.g. confidential LLM requests)
- ✅ Tracking Spend for Custom Tags

## Content Moderation with LlamaGuard
## Content Moderation
### Content Moderation with LlamaGuard

Currently works with Sagemaker's LlamaGuard endpoint.

@@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
```

### Customize LlamaGuard prompt
#### Customize LlamaGuard prompt

To modify the unsafe categories LlamaGuard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)

@@ -51,7 +53,7 @@ callbacks: ["llamaguard_moderations"]
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
```

## Content Moderation with LLM Guard
### Content Moderation with LLM Guard

Set the LLM Guard API Base in your environment

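For example (a minimal sketch - the env var name `LLM_GUARD_API_BASE` and the URL below are assumptions, so adjust them to match your LLM Guard deployment):

```python
import os

# Tell the proxy where your self-hosted LLM Guard API is running
os.environ["LLM_GUARD_API_BASE"] = "http://0.0.0.0:8192"  # example URL
```
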
@@ -78,7 +80,7 @@ Expected results:
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
```

## Content Moderation with Google Text Moderation
### Content Moderation with Google Text Moderation

Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).

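For example, in Python (the path below is a placeholder - point it at the same service-account JSON you already use for VertexAI):

```python
import os

# Reuse the VertexAI service-account credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/path/to/service_account.json"
```
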
@@ -89,7 +91,7 @@ litellm_settings:
  callbacks: ["google_text_moderation"]
```

### Set custom confidence thresholds
#### Set custom confidence thresholds

Google Moderations checks the text against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)

@@ -133,6 +135,33 @@ Here are the category specific values:
| "legal" | legal_threshold: 0.1 |


## Incognito Requests - Don't log anything

When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs on litellm**

```python
import openai

client = openai.OpenAI(
    api_key="anything",             # proxy api-key
    base_url="http://0.0.0.0:8000"  # litellm proxy
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {
            "role": "user",
            "content": "this is a test request, write a short poem"
        }
    ],
    extra_body={
        "no-log": True
    }
)

print(response)
```


## Enable Blocked User Lists

If any call is made to the proxy with a blocked user id, it will be rejected - use this if you want to let users opt out of AI features (a rough client-side sketch follows below).

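As a sketch of what an opted-out caller looks like from the client side (the proxy URL and `user` id below are placeholders, and the block list itself is configured on the proxy - that config is not shown in this excerpt):

```python
import openai

client = openai.OpenAI(
    api_key="anything",             # proxy api-key
    base_url="http://0.0.0.0:8000"  # litellm proxy
)

# If "user_2222" is on the proxy's blocked user list, this request is rejected
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello"}],
    user="user_2222",  # placeholder user id
)
```
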
BIN docs/my-website/img/locust.png (new binary file, 109 KiB; not shown)
|
@ -570,7 +570,7 @@ from .utils import (
|
|||
_calculate_retry_after,
|
||||
_should_retry,
|
||||
get_secret,
|
||||
get_mapped_model_params,
|
||||
get_supported_openai_params,
|
||||
)
|
||||
from .llms.huggingface_restapi import HuggingfaceConfig
|
||||
from .llms.anthropic import AnthropicConfig
|
||||
|
|
|
@ -31,6 +31,18 @@ def _turn_on_debug():
|
|||
verbose_proxy_logger.setLevel(level=logging.DEBUG) # set proxy logs to debug
|
||||
|
||||
|
||||
def _disable_debugging():
|
||||
verbose_logger.disabled = True
|
||||
verbose_router_logger.disabled = True
|
||||
verbose_proxy_logger.disabled = True
|
||||
|
||||
|
||||
def _enable_debugging():
|
||||
verbose_logger.disabled = False
|
||||
verbose_router_logger.disabled = False
|
||||
verbose_proxy_logger.disabled = False
|
||||
|
||||
|
||||
def print_verbose(print_statement):
|
||||
try:
|
||||
if set_verbose:
|
||||
|
|
|
@ -15,6 +15,7 @@ import litellm, json
|
|||
import httpx
|
||||
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
|
||||
from openai import AzureOpenAI, AsyncAzureOpenAI
|
||||
import uuid
|
||||
|
||||
|
||||
class AzureOpenAIError(Exception):
|
||||
|
@ -271,6 +272,14 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_client = AzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
azure_client = client
|
||||
if api_version is not None and isinstance(
|
||||
azure_client._custom_query, dict
|
||||
):
|
||||
# set api_version to version passed by user
|
||||
azure_client._custom_query.setdefault(
|
||||
"api-version", api_version
|
||||
)
|
||||
|
||||
response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore
|
||||
stringified_response = response.model_dump()
|
||||
## LOGGING
|
||||
|
@ -334,10 +343,17 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_client_params["api_key"] = api_key
|
||||
elif azure_ad_token is not None:
|
||||
azure_client_params["azure_ad_token"] = azure_ad_token
|
||||
|
||||
# setting Azure client
|
||||
if client is None:
|
||||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
azure_client = client
|
||||
if api_version is not None and isinstance(
|
||||
azure_client._custom_query, dict
|
||||
):
|
||||
# set api_version to version passed by user
|
||||
azure_client._custom_query.setdefault("api-version", api_version)
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=data["messages"],
|
||||
|
@ -402,6 +418,9 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_client = AzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
azure_client = client
|
||||
if api_version is not None and isinstance(azure_client._custom_query, dict):
|
||||
# set api_version to version passed by user
|
||||
azure_client._custom_query.setdefault("api-version", api_version)
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=data["messages"],
|
||||
|
@ -455,6 +474,11 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_client = AsyncAzureOpenAI(**azure_client_params)
|
||||
else:
|
||||
azure_client = client
|
||||
if api_version is not None and isinstance(
|
||||
azure_client._custom_query, dict
|
||||
):
|
||||
# set api_version to version passed by user
|
||||
azure_client._custom_query.setdefault("api-version", api_version)
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=data["messages"],
|
||||
|
@ -813,6 +837,19 @@ class AzureChatCompletion(BaseLLM):
|
|||
azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params) # type: ignore
|
||||
else:
|
||||
azure_client = client
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=f"audio_file_{uuid.uuid4()}",
|
||||
api_key=azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {"Authorization": f"Bearer {azure_client.api_key}"},
|
||||
"api_base": azure_client._base_url._uri_reference,
|
||||
"atranscription": True,
|
||||
"complete_input_dict": data,
|
||||
},
|
||||
)
|
||||
|
||||
response = azure_client.audio.transcriptions.create(
|
||||
**data, timeout=timeout # type: ignore
|
||||
)
|
||||
|
@ -850,6 +887,20 @@ class AzureChatCompletion(BaseLLM):
|
|||
else:
|
||||
async_azure_client = client
|
||||
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=f"audio_file_{uuid.uuid4()}",
|
||||
api_key=async_azure_client.api_key,
|
||||
additional_args={
|
||||
"headers": {
|
||||
"Authorization": f"Bearer {async_azure_client.api_key}"
|
||||
},
|
||||
"api_base": async_azure_client._base_url._uri_reference,
|
||||
"atranscription": True,
|
||||
"complete_input_dict": data,
|
||||
},
|
||||
)
|
||||
|
||||
response = await async_azure_client.audio.transcriptions.create(
|
||||
**data, timeout=timeout
|
||||
) # type: ignore
|
||||
|
|
|
@ -238,14 +238,22 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
status_code=422, message=f"Timeout needs to be a float"
|
||||
)
|
||||
|
||||
if custom_llm_provider == "mistral":
|
||||
# check if message content passed in as list, and not string
|
||||
messages = prompt_factory(
|
||||
model=model,
|
||||
messages=messages,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
if custom_llm_provider != "openai":
|
||||
# process all OpenAI compatible provider logic here
|
||||
if custom_llm_provider == "mistral":
|
||||
# check if message content passed in as list, and not string
|
||||
messages = prompt_factory(
|
||||
model=model,
|
||||
messages=messages,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
if custom_llm_provider == "perplexity" and messages is not None:
|
||||
# check if messages.name is passed + supported, if not supported remove
|
||||
messages = prompt_factory(
|
||||
model=model,
|
||||
messages=messages,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
)
|
||||
for _ in range(
|
||||
2
|
||||
): # if call fails due to alternating messages, retry with reformatted message
|
||||
|
|
|
@ -556,6 +556,7 @@ def anthropic_messages_pt(messages: list):
|
|||
3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm)
|
||||
4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
|
||||
5. System messages are a separate param to the Messages API (used for tool calling)
|
||||
6. Ensure we only accept role, content. (message.name is not supported)
|
||||
"""
|
||||
## Ensure final assistant message has no trailing whitespace
|
||||
last_assistant_message_idx: Optional[int] = None
|
||||
|
@ -583,7 +584,9 @@ def anthropic_messages_pt(messages: list):
|
|||
new_content.append({"type": "text", "text": m["text"]})
|
||||
new_messages.append({"role": messages[0]["role"], "content": new_content}) # type: ignore
|
||||
else:
|
||||
new_messages.append(messages[0])
|
||||
new_messages.append(
|
||||
{"role": messages[0]["role"], "content": messages[0]["content"]}
|
||||
)
|
||||
|
||||
return new_messages
|
||||
|
||||
|
@ -606,7 +609,9 @@ def anthropic_messages_pt(messages: list):
|
|||
new_content.append({"type": "text", "content": m["text"]})
|
||||
new_messages.append({"role": messages[i]["role"], "content": new_content}) # type: ignore
|
||||
else:
|
||||
new_messages.append(messages[i])
|
||||
new_messages.append(
|
||||
{"role": messages[i]["role"], "content": messages[i]["content"]}
|
||||
)
|
||||
|
||||
if messages[i]["role"] == messages[i + 1]["role"]:
|
||||
if messages[i]["role"] == "user":
|
||||
|
@ -897,6 +902,10 @@ def prompt_factory(
|
|||
return anthropic_pt(messages=messages)
|
||||
elif "mistral." in model:
|
||||
return mistral_instruct_pt(messages=messages)
|
||||
elif custom_llm_provider == "perplexity":
|
||||
for message in messages:
|
||||
message.pop("name", None)
|
||||
return messages
|
||||
try:
|
||||
if "meta-llama/llama-2" in model and "chat" in model:
|
||||
return llama_2_chat_pt(messages=messages)
|
||||
|
|
|
@ -488,6 +488,8 @@ def completion(
|
|||
### ASYNC CALLS ###
|
||||
acompletion = kwargs.get("acompletion", False)
|
||||
client = kwargs.get("client", None)
|
||||
### Admin Controls ###
|
||||
no_log = kwargs.get("no-log", False)
|
||||
######## end of unpacking kwargs ###########
|
||||
openai_params = [
|
||||
"functions",
|
||||
|
@ -564,6 +566,7 @@ def completion(
|
|||
"caching_groups",
|
||||
"ttl",
|
||||
"cache",
|
||||
"no-log",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
@ -727,6 +730,7 @@ def completion(
|
|||
model_info=model_info,
|
||||
proxy_server_request=proxy_server_request,
|
||||
preset_cache_key=preset_cache_key,
|
||||
no_log=no_log,
|
||||
)
|
||||
logging.update_environment_variables(
|
||||
model=model,
|
||||
|
@ -2418,6 +2422,7 @@ def embedding(
|
|||
"caching_groups",
|
||||
"ttl",
|
||||
"cache",
|
||||
"no-log",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
|
|
@ -16,6 +16,13 @@ from importlib import resources
|
|||
import shutil
|
||||
|
||||
telemetry = None
|
||||
default_num_workers = 1
|
||||
try:
|
||||
default_num_workers = os.cpu_count() or 1
|
||||
if default_num_workers is not None and default_num_workers > 0:
|
||||
default_num_workers -= 1
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
def append_query_params(url, params):
|
||||
|
@ -57,7 +64,7 @@ def is_port_in_use(port):
|
|||
@click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
|
||||
@click.option(
|
||||
"--num_workers",
|
||||
default=1,
|
||||
default=default_num_workers,
|
||||
help="Number of gunicorn workers to spin up",
|
||||
envvar="NUM_WORKERS",
|
||||
)
|
||||
|
|
@@ -5,12 +5,9 @@ model_list:
      api_base: os.environ/AZURE_API_BASE
      api_key: os.environ/AZURE_API_KEY
      api_version: "2023-07-01-preview"
  - model_name: azure-gpt-3.5
    litellm_params:
      model: gpt-3.5-turbo
      api_key: os.environ/OPENAI_API_KEY
    model_info:
      access_groups: ["public"]
litellm_settings:
  set_verbose: True
  success_callback: ["langfuse"]
router_settings:
  set_verbose: True
  debug_level: "DEBUG"
6 litellm/proxy/proxy_load_test/litellm_proxy_config.yaml (new file)

@@ -0,0 +1,6 @@
model_list:
  - model_name: gpt-3.5-turbo
    litellm_params:
      model: openai/my-fake-model
      api_key: my-fake-key
      api_base: http://0.0.0.0:8090
27 litellm/proxy/proxy_load_test/locustfile.py (new file)

@@ -0,0 +1,27 @@
from locust import HttpUser, task, between


class MyUser(HttpUser):
    wait_time = between(1, 5)

    @task
    def chat_completion(self):
        headers = {
            "Content-Type": "application/json",
            # Include any additional headers you may need for authentication, etc.
        }

        # Customize the payload with "model" and "messages" keys
        payload = {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "system", "content": "You are a chat bot."},
                {"role": "user", "content": "Hello, how are you?"},
            ],
            # Add more data as necessary
        }

        # Make a POST request to the "chat/completions" endpoint
        response = self.client.post("chat/completions", json=payload, headers=headers)

        # Print or log the response if needed
50 litellm/proxy/proxy_load_test/openai_endpoint.py (new file)

@@ -0,0 +1,50 @@
# import sys, os
# sys.path.insert(
#     0, os.path.abspath("../")
# )  # Adds the parent directory to the system path
from fastapi import FastAPI, Request, status, HTTPException, Depends
from fastapi.responses import StreamingResponse
from fastapi.security import OAuth2PasswordBearer
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# for completion
@app.post("/chat/completions")
@app.post("/v1/chat/completions")
async def completion(request: Request):
    return {
        "id": "chatcmpl-123",
        "object": "chat.completion",
        "created": 1677652288,
        "model": "gpt-3.5-turbo-0125",
        "system_fingerprint": "fp_44709d6fcb",
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": "\n\nHello there, how may I assist you today?",
                },
                "logprobs": None,
                "finish_reason": "stop",
            }
        ],
        "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
    }


if __name__ == "__main__":
    import uvicorn

    # run this on 8090, 8091, 8092 and 8093
    uvicorn.run(app, host="0.0.0.0", port=8090)
|
@ -1677,9 +1677,9 @@ class ProxyConfig:
|
|||
# these are litellm callbacks - "langfuse", "sentry", "wandb"
|
||||
else:
|
||||
litellm.success_callback.append(callback)
|
||||
verbose_proxy_logger.debug(
|
||||
print( # noqa
|
||||
f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
|
||||
)
|
||||
) # noqa
|
||||
elif key == "failure_callback":
|
||||
litellm.failure_callback = []
|
||||
|
||||
|
@ -2672,6 +2672,11 @@ async def chat_completion(
|
|||
except:
|
||||
data = json.loads(body_str)
|
||||
|
||||
# Azure OpenAI only: check if user passed api-version
|
||||
query_params = dict(request.query_params)
|
||||
if "api-version" in query_params:
|
||||
data["api_version"] = query_params["api-version"]
|
||||
|
||||
# Include original request and headers in the data
|
||||
data["proxy_server_request"] = {
|
||||
"url": str(request.url),
|
||||
|
|
|
@ -83,12 +83,13 @@ def test_completion_claude():
|
|||
|
||||
|
||||
def test_completion_claude_3_empty_response():
|
||||
litellm.set_verbose = True
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
|
||||
},
|
||||
{"role": "user", "content": "Hi gm!"},
|
||||
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
|
||||
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
||||
{
|
||||
"role": "user",
|
||||
|
|
|
@ -511,7 +511,7 @@ def test_completion_mistral_api_stream():
|
|||
|
||||
|
||||
def test_completion_deep_infra_stream():
|
||||
# deep infra currently includes role in the 2nd chunk
|
||||
# deep infra,currently includes role in the 2nd chunk
|
||||
# waiting for them to make a fix on this
|
||||
litellm.set_verbose = True
|
||||
try:
|
||||
|
|
482 litellm/utils.py
@ -981,6 +981,7 @@ class Logging:
|
|||
curl_command = self.model_call_details
|
||||
|
||||
# only print verbose if verbose logger is not set
|
||||
|
||||
if verbose_logger.level == 0:
|
||||
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
|
||||
print_verbose(f"\033[92m{curl_command}\033[0m\n")
|
||||
|
@ -1312,6 +1313,15 @@ class Logging:
|
|||
|
||||
for callback in callbacks:
|
||||
try:
|
||||
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||
if litellm_params.get("no-log", False) == True:
|
||||
# proxy cost tracking cal backs should run
|
||||
if not (
|
||||
isinstance(callback, CustomLogger)
|
||||
and "_PROXY_" in callback.__class__.__name__
|
||||
):
|
||||
print_verbose("no-log request, skipping logging")
|
||||
continue
|
||||
if callback == "lite_debugger":
|
||||
print_verbose("reaches lite_debugger for logging!")
|
||||
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
|
||||
|
@ -1740,7 +1750,20 @@ class Logging:
|
|||
callbacks = litellm._async_success_callback
|
||||
verbose_logger.debug(f"Async success callbacks: {callbacks}")
|
||||
for callback in callbacks:
|
||||
# check if callback can run for this request
|
||||
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||
if litellm_params.get("no-log", False) == True:
|
||||
# proxy cost tracking cal backs should run
|
||||
if not (
|
||||
isinstance(callback, CustomLogger)
|
||||
and "_PROXY_" in callback.__class__.__name__
|
||||
):
|
||||
print_verbose("no-log request, skipping logging")
|
||||
continue
|
||||
try:
|
||||
if kwargs.get("no-log", False) == True:
|
||||
print_verbose("no-log request, skipping logging")
|
||||
continue
|
||||
if callback == "cache" and litellm.cache is not None:
|
||||
# set_cache once complete streaming response is built
|
||||
print_verbose("async success_callback: reaches cache for logging!")
|
||||
|
@ -3026,11 +3049,13 @@ def client(original_function):
|
|||
print_verbose(
|
||||
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
|
||||
)
|
||||
# check if user does not want this to be logged
|
||||
asyncio.create_task(
|
||||
logging_obj.async_success_handler(result, start_time, end_time)
|
||||
)
|
||||
threading.Thread(
|
||||
target=logging_obj.success_handler, args=(result, start_time, end_time)
|
||||
target=logging_obj.success_handler,
|
||||
args=(result, start_time, end_time),
|
||||
).start()
|
||||
|
||||
# RETURN RESULT
|
||||
|
@ -3933,6 +3958,7 @@ def get_litellm_params(
|
|||
proxy_server_request=None,
|
||||
acompletion=None,
|
||||
preset_cache_key=None,
|
||||
no_log=None,
|
||||
):
|
||||
litellm_params = {
|
||||
"acompletion": acompletion,
|
||||
|
@ -3949,6 +3975,7 @@ def get_litellm_params(
|
|||
"model_info": model_info,
|
||||
"proxy_server_request": proxy_server_request,
|
||||
"preset_cache_key": preset_cache_key,
|
||||
"no-log": no_log,
|
||||
"stream_response": {}, # litellm_call_id: ModelResponse Dict
|
||||
}
|
||||
|
||||
|
@ -4269,15 +4296,9 @@ def get_optional_params(
|
|||
## raise exception if provider doesn't support passed in param
|
||||
if custom_llm_provider == "anthropic":
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"stream",
|
||||
"stop",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"max_tokens",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# handle anthropic params
|
||||
if stream:
|
||||
|
@ -4301,17 +4322,9 @@ def get_optional_params(
|
|||
optional_params["tools"] = tools
|
||||
elif custom_llm_provider == "cohere":
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"logit_bias",
|
||||
"top_p",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
"stop",
|
||||
"n",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# handle cohere params
|
||||
if stream:
|
||||
|
@ -4334,14 +4347,9 @@ def get_optional_params(
|
|||
optional_params["stop_sequences"] = stop
|
||||
elif custom_llm_provider == "maritalk":
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"presence_penalty",
|
||||
"stop",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# handle cohere params
|
||||
if stream:
|
||||
|
@ -4360,14 +4368,9 @@ def get_optional_params(
|
|||
optional_params["stopping_tokens"] = stop
|
||||
elif custom_llm_provider == "replicate":
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"stop",
|
||||
"seed",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if stream:
|
||||
|
@ -4388,7 +4391,9 @@ def get_optional_params(
|
|||
optional_params["stop_sequences"] = stop
|
||||
elif custom_llm_provider == "huggingface":
|
||||
## check if unsupported param passed in
|
||||
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
||||
if temperature is not None:
|
||||
|
@ -4427,16 +4432,9 @@ def get_optional_params(
|
|||
) # since we handle translating echo, we should not send it to TGI request
|
||||
elif custom_llm_provider == "together_ai":
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"stop",
|
||||
"frequency_penalty",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if stream:
|
||||
|
@ -4457,16 +4455,9 @@ def get_optional_params(
|
|||
optional_params["tool_choice"] = tool_choice
|
||||
elif custom_llm_provider == "ai21":
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"stream",
|
||||
"n",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"stop",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if stream:
|
||||
|
@ -4489,7 +4480,9 @@ def get_optional_params(
|
|||
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
|
||||
): # https://developers.generativeai.google/tutorials/curl_quickstart
|
||||
## check if unsupported param passed in
|
||||
supported_params = ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if temperature is not None:
|
||||
|
@ -4518,14 +4511,9 @@ def get_optional_params(
|
|||
):
|
||||
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
|
||||
## check if unsupported param passed in
|
||||
supported_params = [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if temperature is not None:
|
||||
|
@ -4555,7 +4543,9 @@ def get_optional_params(
|
|||
)
|
||||
elif custom_llm_provider == "sagemaker":
|
||||
## check if unsupported param passed in
|
||||
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
||||
if temperature is not None:
|
||||
|
@ -4582,8 +4572,10 @@ def get_optional_params(
|
|||
max_tokens = 1
|
||||
optional_params["max_new_tokens"] = max_tokens
|
||||
elif custom_llm_provider == "bedrock":
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
if "ai21" in model:
|
||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
|
||||
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
|
||||
|
@ -4596,9 +4588,6 @@ def get_optional_params(
|
|||
if stream:
|
||||
optional_params["stream"] = stream
|
||||
elif "anthropic" in model:
|
||||
supported_params = get_mapped_model_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# anthropic params on bedrock
|
||||
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
|
||||
|
@ -4615,7 +4604,6 @@ def get_optional_params(
|
|||
optional_params=optional_params,
|
||||
)
|
||||
elif "amazon" in model: # amazon titan llms
|
||||
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
||||
if max_tokens is not None:
|
||||
|
@ -4632,7 +4620,6 @@ def get_optional_params(
|
|||
if stream:
|
||||
optional_params["stream"] = stream
|
||||
elif "meta" in model: # amazon / meta llms
|
||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
||||
if max_tokens is not None:
|
||||
|
@ -4644,7 +4631,6 @@ def get_optional_params(
|
|||
if stream:
|
||||
optional_params["stream"] = stream
|
||||
elif "cohere" in model: # cohere models on bedrock
|
||||
supported_params = ["stream", "temperature", "max_tokens"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# handle cohere params
|
||||
if stream:
|
||||
|
@ -4654,7 +4640,6 @@ def get_optional_params(
|
|||
if max_tokens is not None:
|
||||
optional_params["max_tokens"] = max_tokens
|
||||
elif "mistral" in model:
|
||||
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# mistral params on bedrock
|
||||
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
|
||||
|
@ -4698,7 +4683,9 @@ def get_optional_params(
|
|||
optional_params["stop_sequences"] = stop
|
||||
elif custom_llm_provider == "cloudflare":
|
||||
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
|
||||
supported_params = ["max_tokens", "stream"]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if max_tokens is not None:
|
||||
|
@ -4706,14 +4693,9 @@ def get_optional_params(
|
|||
if stream is not None:
|
||||
optional_params["stream"] = stream
|
||||
elif custom_llm_provider == "ollama":
|
||||
supported_params = [
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"top_p",
|
||||
"temperature",
|
||||
"frequency_penalty",
|
||||
"stop",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if max_tokens is not None:
|
||||
|
@ -4737,16 +4719,9 @@ def get_optional_params(
|
|||
non_default_params=non_default_params, optional_params=optional_params
|
||||
)
|
||||
elif custom_llm_provider == "nlp_cloud":
|
||||
supported_params = [
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"n",
|
||||
"stop",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if max_tokens is not None:
|
||||
|
@ -4766,7 +4741,9 @@ def get_optional_params(
|
|||
if stop is not None:
|
||||
optional_params["stop_sequences"] = stop
|
||||
elif custom_llm_provider == "petals":
|
||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
# max_new_tokens=1,temperature=0.9, top_p=0.6
|
||||
if max_tokens is not None:
|
||||
|
@ -4778,18 +4755,9 @@ def get_optional_params(
|
|||
if stream:
|
||||
optional_params["stream"] = stream
|
||||
elif custom_llm_provider == "deepinfra":
|
||||
supported_params = [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
if temperature is not None:
|
||||
if (
|
||||
|
@ -4816,14 +4784,9 @@ def get_optional_params(
|
|||
if user:
|
||||
optional_params["user"] = user
|
||||
elif custom_llm_provider == "perplexity":
|
||||
supported_params = [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"stream",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
if temperature is not None:
|
||||
if (
|
||||
|
@ -4842,15 +4805,9 @@ def get_optional_params(
|
|||
if frequency_penalty:
|
||||
optional_params["frequency_penalty"] = frequency_penalty
|
||||
elif custom_llm_provider == "anyscale":
|
||||
supported_params = [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"stream",
|
||||
"max_tokens",
|
||||
"stop",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
if model in [
|
||||
"mistralai/Mistral-7B-Instruct-v0.1",
|
||||
"mistralai/Mixtral-8x7B-Instruct-v0.1",
|
||||
|
@ -4878,14 +4835,9 @@ def get_optional_params(
|
|||
if max_tokens:
|
||||
optional_params["max_tokens"] = max_tokens
|
||||
elif custom_llm_provider == "mistral":
|
||||
supported_params = [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"stream",
|
||||
"max_tokens",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
if temperature is not None:
|
||||
optional_params["temperature"] = temperature
|
||||
|
@ -4912,25 +4864,9 @@ def get_optional_params(
|
|||
extra_body # openai client supports `extra_body` param
|
||||
)
|
||||
elif custom_llm_provider == "openrouter":
|
||||
supported_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
"response_format",
|
||||
"seed",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"max_retries",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider=custom_llm_provider
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
|
||||
if functions is not None:
|
||||
|
@ -4984,28 +4920,9 @@ def get_optional_params(
|
|||
)
|
||||
else: # assume passing in params for openai/azure openai
|
||||
print_verbose(f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE")
|
||||
supported_params = [
|
||||
"functions",
|
||||
"function_call",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
"response_format",
|
||||
"seed",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"max_retries",
|
||||
"logprobs",
|
||||
"top_logprobs",
|
||||
"extra_headers",
|
||||
]
|
||||
supported_params = get_supported_openai_params(
|
||||
model=model, custom_llm_provider="openai"
|
||||
)
|
||||
_check_valid_arg(supported_params=supported_params)
|
||||
if functions is not None:
|
||||
optional_params["functions"] = functions
|
||||
|
@ -5063,15 +4980,228 @@ def get_optional_params(
|
|||
return optional_params
|
||||
|
||||
|
||||
def get_mapped_model_params(model: str, custom_llm_provider: str):
|
||||
def get_supported_openai_params(model: str, custom_llm_provider: str):
|
||||
"""
|
||||
Returns the supported openai params for a given model + provider
|
||||
|
||||
Example:
|
||||
```
|
||||
get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
|
||||
```
|
||||
"""
|
||||
if custom_llm_provider == "bedrock":
|
||||
if model.startswith("anthropic.claude-3"):
|
||||
return litellm.AmazonAnthropicClaude3Config().get_supported_openai_params()
|
||||
else:
|
||||
elif model.startswith("anthropic"):
|
||||
return litellm.AmazonAnthropicConfig().get_supported_openai_params()
|
||||
elif model.startswith("ai21"):
|
||||
return ["max_tokens", "temperature", "top_p", "stream"]
|
||||
elif model.startswith("amazon"):
|
||||
return ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
||||
elif model.startswith("meta"):
|
||||
return ["max_tokens", "temperature", "top_p", "stream"]
|
||||
elif model.startswith("cohere"):
|
||||
return ["stream", "temperature", "max_tokens"]
|
||||
elif model.startswith("mistral"):
|
||||
return ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
||||
elif custom_llm_provider == "ollama_chat":
|
||||
return litellm.OllamaChatConfig().get_supported_openai_params()
|
||||
elif custom_llm_provider == "anthropic":
|
||||
return [
|
||||
"stream",
|
||||
"stop",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"max_tokens",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
elif custom_llm_provider == "cohere":
|
||||
return [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"logit_bias",
|
||||
"top_p",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
"stop",
|
||||
"n",
|
||||
]
|
||||
elif custom_llm_provider == "maritalk":
|
||||
return [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"presence_penalty",
|
||||
"stop",
|
||||
]
|
||||
elif custom_llm_provider == "openai" or custom_llm_provider == "azure":
|
||||
return [
|
||||
"functions",
|
||||
"function_call",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
"response_format",
|
||||
"seed",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"max_retries",
|
||||
"logprobs",
|
||||
"top_logprobs",
|
||||
"extra_headers",
|
||||
]
|
||||
elif custom_llm_provider == "openrouter":
|
||||
return [
|
||||
"functions",
|
||||
"function_call",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
"response_format",
|
||||
"seed",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
"max_retries",
|
||||
]
|
||||
elif custom_llm_provider == "mistral":
|
||||
return [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"stream",
|
||||
"max_tokens",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
elif custom_llm_provider == "replicate":
|
||||
return [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"stop",
|
||||
"seed",
|
||||
]
|
||||
elif custom_llm_provider == "huggingface":
|
||||
return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
||||
elif custom_llm_provider == "together_ai":
|
||||
return [
|
||||
"stream",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"stop",
|
||||
"frequency_penalty",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
elif custom_llm_provider == "ai21":
|
||||
return [
|
||||
"stream",
|
||||
"n",
|
||||
"temperature",
|
||||
"max_tokens",
|
||||
"top_p",
|
||||
"stop",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
]
|
||||
elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
|
||||
return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
|
||||
elif custom_llm_provider == "vertex_ai":
|
||||
return [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"tools",
|
||||
"tool_choice",
|
||||
]
|
||||
elif custom_llm_provider == "sagemaker":
|
||||
return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
||||
elif custom_llm_provider == "aleph_alpha":
|
||||
return [
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"top_p",
|
||||
"temperature",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"n",
|
||||
"stop",
|
||||
]
|
||||
elif custom_llm_provider == "cloudflare":
|
||||
return ["max_tokens", "stream"]
|
||||
elif custom_llm_provider == "ollama":
|
||||
return [
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"top_p",
|
||||
"temperature",
|
||||
"frequency_penalty",
|
||||
"stop",
|
||||
]
|
||||
elif custom_llm_provider == "nlp_cloud":
|
||||
return [
|
||||
"max_tokens",
|
||||
"stream",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"n",
|
||||
"stop",
|
||||
]
|
||||
elif custom_llm_provider == "petals":
|
||||
return ["max_tokens", "temperature", "top_p", "stream"]
|
||||
elif custom_llm_provider == "deepinfra":
|
||||
return [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
"logit_bias",
|
||||
"user",
|
||||
]
|
||||
elif custom_llm_provider == "perplexity":
|
||||
return [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"stream",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
"frequency_penalty",
|
||||
]
|
||||
elif custom_llm_provider == "anyscale":
|
||||
return [
|
||||
"temperature",
|
||||
"top_p",
|
||||
"stream",
|
||||
"max_tokens",
|
||||
"stop",
|
||||
"frequency_penalty",
|
||||
"presence_penalty",
|
||||
]
|
||||
|
||||
|
||||
def get_llm_provider(
|
||||
|
|
@@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.30.3"
version = "1.30.4"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"

@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"

[tool.commitizen]
version = "1.30.3"
version = "1.30.4"
version_files = [
    "pyproject.toml:^version"
]

@@ -9,6 +9,7 @@ import sys, os, dotenv
from typing import Optional
from dotenv import load_dotenv

# Get the current directory of the file being run
pwd = os.path.dirname(os.path.realpath(__file__))
print(pwd)


@@ -37,12 +38,13 @@ def test_transcription():


def test_transcription_azure():
    litellm.set_verbose = True
    transcript = litellm.transcription(
        model="azure/azure-whisper",
        file=audio_file,
        api_base=os.getenv("AZURE_EUROPE_API_BASE"),
        api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
        api_key=os.getenv("AZURE_EUROPE_API_KEY"),
        api_version=os.getenv("2024-02-15-preview"),
        api_version="2024-02-15-preview",
    )

    assert transcript.text is not None

@@ -57,9 +59,9 @@ async def test_transcription_async_azure():
    transcript = await litellm.atranscription(
        model="azure/azure-whisper",
        file=audio_file,
        api_base=os.getenv("AZURE_EUROPE_API_BASE"),
        api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
        api_key=os.getenv("AZURE_EUROPE_API_KEY"),
        api_version=os.getenv("2024-02-15-preview"),
        api_version="2024-02-15-preview",
    )

    assert transcript.text is not None

@@ -96,7 +98,7 @@ async def test_transcription_on_router():
            "model_name": "whisper",
            "litellm_params": {
                "model": "azure/azure-whisper",
                "api_base": os.getenv("AZURE_EUROPE_API_BASE"),
                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
                "api_key": os.getenv("AZURE_EUROPE_API_KEY"),
                "api_version": "2024-02-15-preview",
            },