forked from phoenix/litellm-mirror
Merge branch 'main' into litellm_load_balancing_transcription_endpoints
commit caa99f43bf
22 changed files with 704 additions and 233 deletions
@@ -1,5 +1,84 @@
+import Image from '@theme/IdealImage';
+
 # 🔥 Load Test LiteLLM
 
+## Load Test LiteLLM Proxy - 1500+ req/s
+
+## 1500+ concurrent requests/s
+
+LiteLLM proxy has been load tested to handle 1500+ concurrent req/s
+
+```python
+import time, asyncio
+from openai import AsyncOpenAI, AsyncAzureOpenAI
+import uuid
+import traceback
+
+# base_url - litellm proxy endpoint
+# api_key - litellm proxy api-key, created when the proxy is started with auth
+litellm_client = AsyncOpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")
+
+
+async def litellm_completion():
+    # Your existing code for litellm_completion goes here
+    try:
+        response = await litellm_client.chat.completions.create(
+            model="azure-gpt-3.5",
+            messages=[{"role": "user", "content": f"This is a test: {uuid.uuid4()}"}],
+        )
+        print(response)
+        return response
+
+    except Exception as e:
+        # If there's an exception, log the error message
+        with open("error_log.txt", "a") as error_log:
+            error_log.write(f"Error during completion: {str(e)}\n")
+        pass
+
+
+async def main():
+    for i in range(1):
+        start = time.time()
+        n = 1500  # Number of concurrent tasks
+        tasks = [litellm_completion() for _ in range(n)]
+
+        chat_completions = await asyncio.gather(*tasks)
+
+        successful_completions = [c for c in chat_completions if c is not None]
+
+        # Write errors to error_log.txt
+        with open("error_log.txt", "a") as error_log:
+            for completion in chat_completions:
+                if isinstance(completion, str):
+                    error_log.write(completion + "\n")
+
+        print(n, time.time() - start, len(successful_completions))
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    # Blank out contents of error_log.txt
+    open("error_log.txt", "w").close()
+
+    asyncio.run(main())
+```
+
+### Throughput - 30% Increase
+
+LiteLLM proxy + Load Balancer gives a **30% increase** in throughput compared to the raw OpenAI API
+
+<Image img={require('../img/throughput.png')} />
+
+### Latency Added - 0.00325 seconds
+
+LiteLLM proxy adds **0.00325 seconds** of latency compared to using the raw OpenAI API
+
+<Image img={require('../img/latency.png')} />
+
+### Testing LiteLLM Proxy with Locust
+
+- 1 LiteLLM container can handle ~140 requests/second with 0.4 failures
+
+<Image img={require('../img/locust.png')} />
+
+## Load Test LiteLLM SDK vs OpenAI
 Here is a script to load test LiteLLM vs OpenAI
 
 ```python
@@ -85,3 +164,4 @@ async def loadtest_fn():
 asyncio.run(loadtest_fn())
 
+
 ```
@@ -49,9 +49,9 @@ model_list:
       rpm: 6
   - model_name: anthropic-claude
     litellm_params:
-      model="bedrock/anthropic.claude-instant-v1"
+      model: bedrock/anthropic.claude-instant-v1
       ### [OPTIONAL] SET AWS REGION ###
-      aws_region_name="us-east-1"
+      aws_region_name: us-east-1
   - model_name: vllm-models
     litellm_params:
       model: openai/facebook/opt-125m # the `openai/` prefix tells litellm it's openai compatible
@@ -68,6 +68,72 @@ CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gun
 
 </TabItem>
 
+<TabItem value="kubernetes" label="Kubernetes">
+
+Deploying a config-file-based LiteLLM instance just requires a simple Deployment that loads
+the config.yaml via a ConfigMap. It is also good practice to declare API keys as env vars
+and attach them to the pod as an Opaque Secret.
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: litellm-config-file
+data:
+  config.yaml: |
+    model_list:
+      - model_name: gpt-3.5-turbo
+        litellm_params:
+          model: azure/gpt-turbo-small-ca
+          api_base: https://my-endpoint-canada-berri992.openai.azure.com/
+          api_key: os.environ/CA_AZURE_OPENAI_API_KEY
+---
+apiVersion: v1
+kind: Secret
+type: Opaque
+metadata:
+  name: litellm-secrets
+data:
+  CA_AZURE_OPENAI_API_KEY: bWVvd19pbV9hX2NhdA== # your api key in base64
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: litellm-deployment
+  labels:
+    app: litellm
+spec:
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      containers:
+        - name: litellm
+          image: ghcr.io/berriai/litellm:main-latest # it is recommended to fix a version generally
+          ports:
+            - containerPort: 4000
+          volumeMounts:
+            - name: config-volume
+              mountPath: /app/proxy_server_config.yaml
+              subPath: config.yaml
+          envFrom:
+            - secretRef:
+                name: litellm-secrets
+      volumes:
+        - name: config-volume
+          configMap:
+            name: litellm-config-file
+```
+
+> [!TIP]
+> To avoid issues with predictability, difficulties in rollback, and inconsistent environments, use versioning or SHA digests (for example, `litellm:main-v1.30.3` or `litellm@sha256:12345abcdef...`) instead of `litellm:main-latest`.
+
+</TabItem>
+
 </Tabs>
 
 ## Deploy with Database
@@ -350,17 +416,3 @@ Run the command `docker-compose up` or `docker compose up` as per your docker in
 
 Your LiteLLM container should be running now on the defined port e.g. `8000`.
 
-
-## LiteLLM Proxy Performance
-
-LiteLLM proxy has been load tested to handle 1500 req/s.
-
-### Throughput - 30% Increase
-LiteLLM proxy + Load Balancer gives **30% increase** in throughput compared to Raw OpenAI API
-<Image img={require('../../img/throughput.png')} />
-
-### Latency Added - 0.00325 seconds
-LiteLLM proxy adds **0.00325 seconds** latency as compared to using the Raw OpenAI API
-<Image img={require('../../img/latency.png')} />
-
@@ -12,14 +12,16 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
 :::
 
 Features:
-- [ ] Content Moderation with LlamaGuard
-- [ ] Content Moderation with Google Text Moderations
-- [ ] Content Moderation with LLM Guard
-- [ ] Reject calls from Blocked User list
-- [ ] Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
-- [ ] Tracking Spend for Custom Tags
+- ✅ Content Moderation with LlamaGuard
+- ✅ Content Moderation with Google Text Moderations
+- ✅ Content Moderation with LLM Guard
+- ✅ Reject calls from Blocked User list
+- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
+- ✅ Don't log/store specific requests (eg confidential LLM requests)
+- ✅ Tracking Spend for Custom Tags
 
-## Content Moderation with LlamaGuard
+## Content Moderation
+### Content Moderation with LlamaGuard
 
 Currently works with Sagemaker's LlamaGuard endpoint.
 
@@ -39,7 +41,7 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
 os.environ["AWS_REGION_NAME"] = ""
 ```
 
-### Customize LlamaGuard prompt
+#### Customize LlamaGuard prompt
 
 To modify the unsafe categories llama guard evaluates against, just create your own version of [this category list](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/llamaguard_prompt.txt)
 
@@ -51,7 +53,7 @@ callbacks: ["llamaguard_moderations"]
 llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
 ```
 
-## Content Moderation with LLM Guard
+### Content Moderation with LLM Guard
 
 Set the LLM Guard API Base in your environment
 
@@ -78,7 +80,7 @@ Expected results:
 LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
 ```
 
-## Content Moderation with Google Text Moderation
+### Content Moderation with Google Text Moderation
 
 Requires your GOOGLE_APPLICATION_CREDENTIALS to be set in your .env (same as VertexAI).
 
@@ -89,7 +91,7 @@ litellm_settings:
 callbacks: ["google_text_moderation"]
 ```
 
-### Set custom confidence thresholds
+#### Set custom confidence thresholds
 
 Google Moderations checks the text against several categories. [Source](https://cloud.google.com/natural-language/docs/moderating-text#safety_attribute_confidence_scores)
 
@@ -133,6 +135,33 @@ Here are the category specific values:
 | "legal" | legal_threshold: 0.1 |
 
 
+## Incognito Requests - Don't log anything
+
+When `no-log=True`, the request will **not be logged on any callbacks** and there will be **no server logs on litellm**
+
+```python
+import openai
+
+client = openai.OpenAI(
+    api_key="anything",             # proxy api-key
+    base_url="http://0.0.0.0:8000"  # litellm proxy
+)
+
+response = client.chat.completions.create(
+    model="gpt-3.5-turbo",
+    messages=[
+        {
+            "role": "user",
+            "content": "this is a test request, write a short poem"
+        }
+    ],
+    extra_body={
+        "no-log": True
+    }
+)
+
+print(response)
+```
+
 ## Enable Blocked User Lists
 If any call is made to the proxy with a blocked user id, it'll be rejected - use this if you want to let users opt out of AI features
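For illustration, a minimal client-side sketch of what a rejected call could look like, assuming the proxy has its blocked-user check enabled and the caller identifies the end user via the standard OpenAI `user` field (the user id below is hypothetical; the exact proxy configuration keys are not shown here and are covered in the enterprise docs):

```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:8000")  # litellm proxy

try:
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
        user="user_id_1",  # hypothetical id assumed to be on the proxy's blocked-user list
    )
    print(response)
except openai.APIError as e:
    # calls from blocked user ids are expected to be rejected by the proxy
    print("request rejected:", e)
```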
BIN docs/my-website/img/locust.png (new binary file, not shown; 109 KiB)
@@ -570,7 +570,7 @@ from .utils import (
     _calculate_retry_after,
     _should_retry,
     get_secret,
-    get_mapped_model_params,
+    get_supported_openai_params,
 )
 from .llms.huggingface_restapi import HuggingfaceConfig
 from .llms.anthropic import AnthropicConfig
@@ -31,6 +31,18 @@ def _turn_on_debug():
     verbose_proxy_logger.setLevel(level=logging.DEBUG)  # set proxy logs to debug
 
 
+def _disable_debugging():
+    verbose_logger.disabled = True
+    verbose_router_logger.disabled = True
+    verbose_proxy_logger.disabled = True
+
+
+def _enable_debugging():
+    verbose_logger.disabled = False
+    verbose_router_logger.disabled = False
+    verbose_proxy_logger.disabled = False
+
+
 def print_verbose(print_statement):
     try:
         if set_verbose:
@@ -15,6 +15,7 @@ import litellm, json
 import httpx
 from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
 from openai import AzureOpenAI, AsyncAzureOpenAI
+import uuid
 
 
 class AzureOpenAIError(Exception):
@@ -271,6 +272,14 @@ class AzureChatCompletion(BaseLLM):
                 azure_client = AzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(
+                    azure_client._custom_query, dict
+                ):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault(
+                        "api-version", api_version
+                    )
+
             response = azure_client.chat.completions.create(**data, timeout=timeout)  # type: ignore
             stringified_response = response.model_dump()
             ## LOGGING
@@ -334,10 +343,17 @@ class AzureChatCompletion(BaseLLM):
                 azure_client_params["api_key"] = api_key
             elif azure_ad_token is not None:
                 azure_client_params["azure_ad_token"] = azure_ad_token
+
+            # setting Azure client
             if client is None:
                 azure_client = AsyncAzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(
+                    azure_client._custom_query, dict
+                ):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault("api-version", api_version)
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
@@ -402,6 +418,9 @@ class AzureChatCompletion(BaseLLM):
                 azure_client = AzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(azure_client._custom_query, dict):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault("api-version", api_version)
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
@@ -455,6 +474,11 @@ class AzureChatCompletion(BaseLLM):
                 azure_client = AsyncAzureOpenAI(**azure_client_params)
             else:
                 azure_client = client
+                if api_version is not None and isinstance(
+                    azure_client._custom_query, dict
+                ):
+                    # set api_version to version passed by user
+                    azure_client._custom_query.setdefault("api-version", api_version)
             ## LOGGING
             logging_obj.pre_call(
                 input=data["messages"],
@@ -813,6 +837,19 @@ class AzureChatCompletion(BaseLLM):
             azure_client = AzureOpenAI(http_client=litellm.client_session, **azure_client_params)  # type: ignore
         else:
             azure_client = client
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=f"audio_file_{uuid.uuid4()}",
+            api_key=azure_client.api_key,
+            additional_args={
+                "headers": {"Authorization": f"Bearer {azure_client.api_key}"},
+                "api_base": azure_client._base_url._uri_reference,
+                "atranscription": True,
+                "complete_input_dict": data,
+            },
+        )
+
         response = azure_client.audio.transcriptions.create(
             **data, timeout=timeout  # type: ignore
         )
@@ -850,6 +887,20 @@ class AzureChatCompletion(BaseLLM):
         else:
             async_azure_client = client
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=f"audio_file_{uuid.uuid4()}",
+            api_key=async_azure_client.api_key,
+            additional_args={
+                "headers": {
+                    "Authorization": f"Bearer {async_azure_client.api_key}"
+                },
+                "api_base": async_azure_client._base_url._uri_reference,
+                "atranscription": True,
+                "complete_input_dict": data,
+            },
+        )
+
         response = await async_azure_client.audio.transcriptions.create(
             **data, timeout=timeout
         )  # type: ignore
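The api-version handling above matters when a pre-built Azure client is reused: a user-supplied `api_version` is now propagated onto the client's query params. For reference only (not part of this commit), a minimal sketch of a call that supplies an explicit `api_version`; the deployment name and version string are placeholders:

```python
import litellm

# hypothetical Azure deployment name and api version, shown only to illustrate
# how api_version flows through to the (possibly reused) AzureOpenAI client
response = litellm.completion(
    model="azure/my-gpt-35-deployment",
    api_version="2023-07-01-preview",
    messages=[{"role": "user", "content": "hello"}],
)
print(response.choices[0].message.content)
```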
@@ -238,14 +238,22 @@ class OpenAIChatCompletion(BaseLLM):
                     status_code=422, message=f"Timeout needs to be a float"
                 )
 
-            if custom_llm_provider == "mistral":
-                # check if message content passed in as list, and not string
-                messages = prompt_factory(
-                    model=model,
-                    messages=messages,
-                    custom_llm_provider=custom_llm_provider,
-                )
+            if custom_llm_provider != "openai":
+                # process all OpenAI compatible provider logic here
+                if custom_llm_provider == "mistral":
+                    # check if message content passed in as list, and not string
+                    messages = prompt_factory(
+                        model=model,
+                        messages=messages,
+                        custom_llm_provider=custom_llm_provider,
+                    )
+                if custom_llm_provider == "perplexity" and messages is not None:
+                    # check if messages.name is passed + supported, if not supported remove
+                    messages = prompt_factory(
+                        model=model,
+                        messages=messages,
+                        custom_llm_provider=custom_llm_provider,
+                    )
 
             for _ in range(
                 2
             ):  # if call fails due to alternating messages, retry with reformatted message
@@ -556,6 +556,7 @@ def anthropic_messages_pt(messages: list):
     3. Each message must alternate between "user" and "assistant" (this is not addressed as now by litellm)
     4. final assistant content cannot end with trailing whitespace (anthropic raises an error otherwise)
     5. System messages are a separate param to the Messages API (used for tool calling)
+    6. Ensure we only accept role, content. (message.name is not supported)
     """
     ## Ensure final assistant message has no trailing whitespace
     last_assistant_message_idx: Optional[int] = None
@@ -583,7 +584,9 @@ def anthropic_messages_pt(messages: list):
                 new_content.append({"type": "text", "text": m["text"]})
             new_messages.append({"role": messages[0]["role"], "content": new_content})  # type: ignore
         else:
-            new_messages.append(messages[0])
+            new_messages.append(
+                {"role": messages[0]["role"], "content": messages[0]["content"]}
+            )
 
         return new_messages
 
@@ -606,7 +609,9 @@ def anthropic_messages_pt(messages: list):
                 new_content.append({"type": "text", "content": m["text"]})
             new_messages.append({"role": messages[i]["role"], "content": new_content})  # type: ignore
         else:
-            new_messages.append(messages[i])
+            new_messages.append(
+                {"role": messages[i]["role"], "content": messages[i]["content"]}
+            )
 
         if messages[i]["role"] == messages[i + 1]["role"]:
             if messages[i]["role"] == "user":
@@ -897,6 +902,10 @@ def prompt_factory(
             return anthropic_pt(messages=messages)
         elif "mistral." in model:
             return mistral_instruct_pt(messages=messages)
+    elif custom_llm_provider == "perplexity":
+        for message in messages:
+            message.pop("name", None)
+        return messages
     try:
         if "meta-llama/llama-2" in model and "chat" in model:
             return llama_2_chat_pt(messages=messages)
@@ -488,6 +488,8 @@ def completion(
     ### ASYNC CALLS ###
     acompletion = kwargs.get("acompletion", False)
     client = kwargs.get("client", None)
+    ### Admin Controls ###
+    no_log = kwargs.get("no-log", False)
     ######## end of unpacking kwargs ###########
     openai_params = [
         "functions",
@@ -564,6 +566,7 @@ def completion(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -727,6 +730,7 @@ def completion(
         model_info=model_info,
         proxy_server_request=proxy_server_request,
         preset_cache_key=preset_cache_key,
+        no_log=no_log,
     )
     logging.update_environment_variables(
         model=model,
@@ -2418,6 +2422,7 @@ def embedding(
         "caching_groups",
         "ttl",
         "cache",
+        "no-log",
     ]
     default_params = openai_params + litellm_params
     non_default_params = {
@@ -16,6 +16,13 @@ from importlib import resources
 import shutil
 
 telemetry = None
+default_num_workers = 1
+try:
+    default_num_workers = os.cpu_count() or 1
+    if default_num_workers is not None and default_num_workers > 0:
+        default_num_workers -= 1
+except:
+    pass
 
 
 def append_query_params(url, params):
@@ -57,7 +64,7 @@ def is_port_in_use(port):
 @click.option("--port", default=8000, help="Port to bind the server to.", envvar="PORT")
 @click.option(
     "--num_workers",
-    default=1,
+    default=default_num_workers,
     help="Number of gunicorn workers to spin up",
     envvar="NUM_WORKERS",
 )
@@ -5,12 +5,9 @@ model_list:
       api_base: os.environ/AZURE_API_BASE
       api_key: os.environ/AZURE_API_KEY
       api_version: "2023-07-01-preview"
-  - model_name: azure-gpt-3.5
-    litellm_params:
-      model: gpt-3.5-turbo
-      api_key: os.environ/OPENAI_API_KEY
-    model_info:
-      access_groups: ["public"]
+litellm_settings:
+  set_verbose: True
+  success_callback: ["langfuse"]
 
 router_settings:
   set_verbose: True
   debug_level: "DEBUG"
litellm/proxy/proxy_load_test/litellm_proxy_config.yaml (new file, 6 lines)
@@ -0,0 +1,6 @@
+model_list:
+  - model_name: gpt-3.5-turbo
+    litellm_params:
+      model: openai/my-fake-model
+      api_key: my-fake-key
+      api_base: http://0.0.0.0:8090
litellm/proxy/proxy_load_test/locustfile.py (new file, 27 lines)
@@ -0,0 +1,27 @@
+from locust import HttpUser, task, between
+
+
+class MyUser(HttpUser):
+    wait_time = between(1, 5)
+
+    @task
+    def chat_completion(self):
+        headers = {
+            "Content-Type": "application/json",
+            # Include any additional headers you may need for authentication, etc.
+        }
+
+        # Customize the payload with "model" and "messages" keys
+        payload = {
+            "model": "gpt-3.5-turbo",
+            "messages": [
+                {"role": "system", "content": "You are a chat bot."},
+                {"role": "user", "content": "Hello, how are you?"},
+            ],
+            # Add more data as necessary
+        }
+
+        # Make a POST request to the "chat/completions" endpoint
+        response = self.client.post("chat/completions", json=payload, headers=headers)
+
+        # Print or log the response if needed
litellm/proxy/proxy_load_test/openai_endpoint.py (new file, 50 lines)
@@ -0,0 +1,50 @@
+# import sys, os
+# sys.path.insert(
+#     0, os.path.abspath("../")
+# )  # Adds the parent directory to the system path
+from fastapi import FastAPI, Request, status, HTTPException, Depends
+from fastapi.responses import StreamingResponse
+from fastapi.security import OAuth2PasswordBearer
+from fastapi.middleware.cors import CORSMiddleware
+
+app = FastAPI()
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+
+# for completion
+@app.post("/chat/completions")
+@app.post("/v1/chat/completions")
+async def completion(request: Request):
+    return {
+        "id": "chatcmpl-123",
+        "object": "chat.completion",
+        "created": 1677652288,
+        "model": "gpt-3.5-turbo-0125",
+        "system_fingerprint": "fp_44709d6fcb",
+        "choices": [
+            {
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": "\n\nHello there, how may I assist you today?",
+                },
+                "logprobs": None,
+                "finish_reason": "stop",
+            }
+        ],
+        "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
+    }
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    # run this on 8090, 8091, 8092 and 8093
+    uvicorn.run(app, host="0.0.0.0", port=8090)
@@ -1677,9 +1677,9 @@ class ProxyConfig:
                 # these are litellm callbacks - "langfuse", "sentry", "wandb"
                 else:
                     litellm.success_callback.append(callback)
-                    verbose_proxy_logger.debug(
+                    print(  # noqa
                         f"{blue_color_code} Initialized Success Callbacks - {litellm.success_callback} {reset_color_code}"
-                    )
+                    )  # noqa
             elif key == "failure_callback":
                 litellm.failure_callback = []
 
@@ -2672,6 +2672,11 @@ async def chat_completion(
         except:
             data = json.loads(body_str)
 
+        # Azure OpenAI only: check if user passed api-version
+        query_params = dict(request.query_params)
+        if "api-version" in query_params:
+            data["api_version"] = query_params["api-version"]
+
         # Include original request and headers in the data
         data["proxy_server_request"] = {
             "url": str(request.url),
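For context on the `api-version` hunk above, a small sketch (not part of the diff) of a client that ends up sending `api-version` as a query parameter to the proxy: the `AzureOpenAI` SDK client appends it to every request URL, and the proxy now forwards it as `data["api_version"]`. The base URL, key, and model name below are placeholders:

```python
from openai import AzureOpenAI

# hypothetical values - point the Azure SDK client at the litellm proxy
client = AzureOpenAI(
    api_key="sk-1234",
    api_version="2023-07-01-preview",   # sent as ?api-version=... on each request
    azure_endpoint="http://0.0.0.0:4000",
)

response = client.chat.completions.create(
    model="azure-gpt-3.5",
    messages=[{"role": "user", "content": "hello"}],
)
print(response)
```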
||||||
|
|
||||||
|
|
||||||
def test_completion_claude_3_empty_response():
|
def test_completion_claude_3_empty_response():
|
||||||
|
litellm.set_verbose = True
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
|
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
|
||||||
},
|
},
|
||||||
{"role": "user", "content": "Hi gm!"},
|
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
|
||||||
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
{"role": "assistant", "content": "Good morning! How are you doing today?"},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
|
|
|
@ -511,7 +511,7 @@ def test_completion_mistral_api_stream():
|
||||||
|
|
||||||
|
|
||||||
def test_completion_deep_infra_stream():
|
def test_completion_deep_infra_stream():
|
||||||
# deep infra currently includes role in the 2nd chunk
|
# deep infra,currently includes role in the 2nd chunk
|
||||||
# waiting for them to make a fix on this
|
# waiting for them to make a fix on this
|
||||||
litellm.set_verbose = True
|
litellm.set_verbose = True
|
||||||
try:
|
try:
|
||||||
|
|
litellm/utils.py (482 changes)
@ -981,6 +981,7 @@ class Logging:
|
||||||
curl_command = self.model_call_details
|
curl_command = self.model_call_details
|
||||||
|
|
||||||
# only print verbose if verbose logger is not set
|
# only print verbose if verbose logger is not set
|
||||||
|
|
||||||
if verbose_logger.level == 0:
|
if verbose_logger.level == 0:
|
||||||
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
|
# this means verbose logger was not switched on - user is in litellm.set_verbose=True
|
||||||
print_verbose(f"\033[92m{curl_command}\033[0m\n")
|
print_verbose(f"\033[92m{curl_command}\033[0m\n")
|
||||||
|
@ -1312,6 +1313,15 @@ class Logging:
|
||||||
|
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
try:
|
try:
|
||||||
|
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||||
|
if litellm_params.get("no-log", False) == True:
|
||||||
|
# proxy cost tracking cal backs should run
|
||||||
|
if not (
|
||||||
|
isinstance(callback, CustomLogger)
|
||||||
|
and "_PROXY_" in callback.__class__.__name__
|
||||||
|
):
|
||||||
|
print_verbose("no-log request, skipping logging")
|
||||||
|
continue
|
||||||
if callback == "lite_debugger":
|
if callback == "lite_debugger":
|
||||||
print_verbose("reaches lite_debugger for logging!")
|
print_verbose("reaches lite_debugger for logging!")
|
||||||
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
|
print_verbose(f"liteDebuggerClient: {liteDebuggerClient}")
|
||||||
|
@ -1740,7 +1750,20 @@ class Logging:
|
||||||
callbacks = litellm._async_success_callback
|
callbacks = litellm._async_success_callback
|
||||||
verbose_logger.debug(f"Async success callbacks: {callbacks}")
|
verbose_logger.debug(f"Async success callbacks: {callbacks}")
|
||||||
for callback in callbacks:
|
for callback in callbacks:
|
||||||
|
# check if callback can run for this request
|
||||||
|
litellm_params = self.model_call_details.get("litellm_params", {})
|
||||||
|
if litellm_params.get("no-log", False) == True:
|
||||||
|
# proxy cost tracking cal backs should run
|
||||||
|
if not (
|
||||||
|
isinstance(callback, CustomLogger)
|
||||||
|
and "_PROXY_" in callback.__class__.__name__
|
||||||
|
):
|
||||||
|
print_verbose("no-log request, skipping logging")
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
|
if kwargs.get("no-log", False) == True:
|
||||||
|
print_verbose("no-log request, skipping logging")
|
||||||
|
continue
|
||||||
if callback == "cache" and litellm.cache is not None:
|
if callback == "cache" and litellm.cache is not None:
|
||||||
# set_cache once complete streaming response is built
|
# set_cache once complete streaming response is built
|
||||||
print_verbose("async success_callback: reaches cache for logging!")
|
print_verbose("async success_callback: reaches cache for logging!")
|
||||||
|
@ -3026,11 +3049,13 @@ def client(original_function):
|
||||||
print_verbose(
|
print_verbose(
|
||||||
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
|
f"Async Wrapper: Completed Call, calling async_success_handler: {logging_obj.async_success_handler}"
|
||||||
)
|
)
|
||||||
|
# check if user does not want this to be logged
|
||||||
asyncio.create_task(
|
asyncio.create_task(
|
||||||
logging_obj.async_success_handler(result, start_time, end_time)
|
logging_obj.async_success_handler(result, start_time, end_time)
|
||||||
)
|
)
|
||||||
threading.Thread(
|
threading.Thread(
|
||||||
target=logging_obj.success_handler, args=(result, start_time, end_time)
|
target=logging_obj.success_handler,
|
||||||
|
args=(result, start_time, end_time),
|
||||||
).start()
|
).start()
|
||||||
|
|
||||||
# RETURN RESULT
|
# RETURN RESULT
|
||||||
|
@ -3933,6 +3958,7 @@ def get_litellm_params(
|
||||||
proxy_server_request=None,
|
proxy_server_request=None,
|
||||||
acompletion=None,
|
acompletion=None,
|
||||||
preset_cache_key=None,
|
preset_cache_key=None,
|
||||||
|
no_log=None,
|
||||||
):
|
):
|
||||||
litellm_params = {
|
litellm_params = {
|
||||||
"acompletion": acompletion,
|
"acompletion": acompletion,
|
||||||
|
@ -3949,6 +3975,7 @@ def get_litellm_params(
|
||||||
"model_info": model_info,
|
"model_info": model_info,
|
||||||
"proxy_server_request": proxy_server_request,
|
"proxy_server_request": proxy_server_request,
|
||||||
"preset_cache_key": preset_cache_key,
|
"preset_cache_key": preset_cache_key,
|
||||||
|
"no-log": no_log,
|
||||||
"stream_response": {}, # litellm_call_id: ModelResponse Dict
|
"stream_response": {}, # litellm_call_id: ModelResponse Dict
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4269,15 +4296,9 @@ def get_optional_params(
|
||||||
## raise exception if provider doesn't support passed in param
|
## raise exception if provider doesn't support passed in param
|
||||||
if custom_llm_provider == "anthropic":
|
if custom_llm_provider == "anthropic":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"stop",
|
)
|
||||||
"temperature",
|
|
||||||
"top_p",
|
|
||||||
"max_tokens",
|
|
||||||
"tools",
|
|
||||||
"tool_choice",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle anthropic params
|
# handle anthropic params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4301,17 +4322,9 @@ def get_optional_params(
|
||||||
optional_params["tools"] = tools
|
optional_params["tools"] = tools
|
||||||
elif custom_llm_provider == "cohere":
|
elif custom_llm_provider == "cohere":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"logit_bias",
|
|
||||||
"top_p",
|
|
||||||
"frequency_penalty",
|
|
||||||
"presence_penalty",
|
|
||||||
"stop",
|
|
||||||
"n",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle cohere params
|
# handle cohere params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4334,14 +4347,9 @@ def get_optional_params(
|
||||||
optional_params["stop_sequences"] = stop
|
optional_params["stop_sequences"] = stop
|
||||||
elif custom_llm_provider == "maritalk":
|
elif custom_llm_provider == "maritalk":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"presence_penalty",
|
|
||||||
"stop",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle cohere params
|
# handle cohere params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4360,14 +4368,9 @@ def get_optional_params(
|
||||||
optional_params["stopping_tokens"] = stop
|
optional_params["stopping_tokens"] = stop
|
||||||
elif custom_llm_provider == "replicate":
|
elif custom_llm_provider == "replicate":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"stop",
|
|
||||||
"seed",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4388,7 +4391,9 @@ def get_optional_params(
|
||||||
optional_params["stop_sequences"] = stop
|
optional_params["stop_sequences"] = stop
|
||||||
elif custom_llm_provider == "huggingface":
|
elif custom_llm_provider == "huggingface":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4427,16 +4432,9 @@ def get_optional_params(
|
||||||
) # since we handle translating echo, we should not send it to TGI request
|
) # since we handle translating echo, we should not send it to TGI request
|
||||||
elif custom_llm_provider == "together_ai":
|
elif custom_llm_provider == "together_ai":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"temperature",
|
)
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"stop",
|
|
||||||
"frequency_penalty",
|
|
||||||
"tools",
|
|
||||||
"tool_choice",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4457,16 +4455,9 @@ def get_optional_params(
|
||||||
optional_params["tool_choice"] = tool_choice
|
optional_params["tool_choice"] = tool_choice
|
||||||
elif custom_llm_provider == "ai21":
|
elif custom_llm_provider == "ai21":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"stream",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"n",
|
)
|
||||||
"temperature",
|
|
||||||
"max_tokens",
|
|
||||||
"top_p",
|
|
||||||
"stop",
|
|
||||||
"frequency_penalty",
|
|
||||||
"presence_penalty",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4489,7 +4480,9 @@ def get_optional_params(
|
||||||
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
|
custom_llm_provider == "palm" or custom_llm_provider == "gemini"
|
||||||
): # https://developers.generativeai.google/tutorials/curl_quickstart
|
): # https://developers.generativeai.google/tutorials/curl_quickstart
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4518,14 +4511,9 @@ def get_optional_params(
|
||||||
):
|
):
|
||||||
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
|
print_verbose(f"(start) INSIDE THE VERTEX AI OPTIONAL PARAM BLOCK")
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"temperature",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"top_p",
|
)
|
||||||
"max_tokens",
|
|
||||||
"stream",
|
|
||||||
"tools",
|
|
||||||
"tool_choice",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4555,7 +4543,9 @@ def get_optional_params(
|
||||||
)
|
)
|
||||||
elif custom_llm_provider == "sagemaker":
|
elif custom_llm_provider == "sagemaker":
|
||||||
## check if unsupported param passed in
|
## check if unsupported param passed in
|
||||||
supported_params = ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
# temperature, top_p, n, stream, stop, max_tokens, n, presence_penalty default to None
|
||||||
if temperature is not None:
|
if temperature is not None:
|
||||||
|
@ -4582,8 +4572,10 @@ def get_optional_params(
|
||||||
max_tokens = 1
|
max_tokens = 1
|
||||||
optional_params["max_new_tokens"] = max_tokens
|
optional_params["max_new_tokens"] = max_tokens
|
||||||
elif custom_llm_provider == "bedrock":
|
elif custom_llm_provider == "bedrock":
|
||||||
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
if "ai21" in model:
|
if "ai21" in model:
|
||||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
|
# params "maxTokens":200,"temperature":0,"topP":250,"stop_sequences":[],
|
||||||
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
|
# https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=j2-ultra
|
||||||
|
@ -4596,9 +4588,6 @@ def get_optional_params(
|
||||||
if stream:
|
if stream:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif "anthropic" in model:
|
elif "anthropic" in model:
|
||||||
supported_params = get_mapped_model_params(
|
|
||||||
model=model, custom_llm_provider=custom_llm_provider
|
|
||||||
)
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# anthropic params on bedrock
|
# anthropic params on bedrock
|
||||||
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
|
# \"max_tokens_to_sample\":300,\"temperature\":0.5,\"top_p\":1,\"stop_sequences\":[\"\\\\n\\\\nHuman:\"]}"
|
||||||
|
@ -4615,7 +4604,6 @@ def get_optional_params(
|
||||||
optional_params=optional_params,
|
optional_params=optional_params,
|
||||||
)
|
)
|
||||||
elif "amazon" in model: # amazon titan llms
|
elif "amazon" in model: # amazon titan llms
|
||||||
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4632,7 +4620,6 @@ def get_optional_params(
|
||||||
if stream:
|
if stream:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif "meta" in model: # amazon / meta llms
|
elif "meta" in model: # amazon / meta llms
|
||||||
supported_params = ["max_tokens", "temperature", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
# see https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=titan-large
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4644,7 +4631,6 @@ def get_optional_params(
|
||||||
if stream:
|
if stream:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif "cohere" in model: # cohere models on bedrock
|
elif "cohere" in model: # cohere models on bedrock
|
||||||
supported_params = ["stream", "temperature", "max_tokens"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# handle cohere params
|
# handle cohere params
|
||||||
if stream:
|
if stream:
|
||||||
|
@ -4654,7 +4640,6 @@ def get_optional_params(
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
optional_params["max_tokens"] = max_tokens
|
optional_params["max_tokens"] = max_tokens
|
||||||
elif "mistral" in model:
|
elif "mistral" in model:
|
||||||
supported_params = ["max_tokens", "temperature", "stop", "top_p", "stream"]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
# mistral params on bedrock
|
# mistral params on bedrock
|
||||||
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
|
# \"max_tokens\":400,\"temperature\":0.7,\"top_p\":0.7,\"stop\":[\"\\\\n\\\\nHuman:\"]}"
|
||||||
|
@ -4698,7 +4683,9 @@ def get_optional_params(
|
||||||
optional_params["stop_sequences"] = stop
|
optional_params["stop_sequences"] = stop
|
||||||
elif custom_llm_provider == "cloudflare":
|
elif custom_llm_provider == "cloudflare":
|
||||||
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
|
# https://developers.cloudflare.com/workers-ai/models/text-generation/#input
|
||||||
supported_params = ["max_tokens", "stream"]
|
supported_params = get_supported_openai_params(
|
||||||
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
|
)
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4706,14 +4693,9 @@ def get_optional_params(
|
||||||
if stream is not None:
|
if stream is not None:
|
||||||
optional_params["stream"] = stream
|
optional_params["stream"] = stream
|
||||||
elif custom_llm_provider == "ollama":
|
elif custom_llm_provider == "ollama":
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"max_tokens",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"stream",
|
)
|
||||||
"top_p",
|
|
||||||
"temperature",
|
|
||||||
"frequency_penalty",
|
|
||||||
"stop",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@ -4737,16 +4719,9 @@ def get_optional_params(
|
||||||
non_default_params=non_default_params, optional_params=optional_params
|
non_default_params=non_default_params, optional_params=optional_params
|
||||||
)
|
)
|
||||||
elif custom_llm_provider == "nlp_cloud":
|
elif custom_llm_provider == "nlp_cloud":
|
||||||
supported_params = [
|
supported_params = get_supported_openai_params(
|
||||||
"max_tokens",
|
model=model, custom_llm_provider=custom_llm_provider
|
||||||
"stream",
|
)
|
||||||
"temperature",
|
|
||||||
"top_p",
|
|
||||||
"presence_penalty",
|
|
||||||
"frequency_penalty",
|
|
||||||
"n",
|
|
||||||
"stop",
|
|
||||||
]
|
|
||||||
_check_valid_arg(supported_params=supported_params)
|
_check_valid_arg(supported_params=supported_params)
|
||||||
|
|
||||||
if max_tokens is not None:
|
if max_tokens is not None:
|
||||||
|
@@ -4766,7 +4741,9 @@ def get_optional_params(
         if stop is not None:
             optional_params["stop_sequences"] = stop
     elif custom_llm_provider == "petals":
-        supported_params = ["max_tokens", "temperature", "top_p", "stream"]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         # max_new_tokens=1,temperature=0.9, top_p=0.6
         if max_tokens is not None:
@@ -4778,18 +4755,9 @@ def get_optional_params(
         if stream:
             optional_params["stream"] = stream
     elif custom_llm_provider == "deepinfra":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "n",
-            "stream",
-            "stop",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "user",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         if temperature is not None:
             if (
@@ -4816,14 +4784,9 @@ def get_optional_params(
         if user:
             optional_params["user"] = user
     elif custom_llm_provider == "perplexity":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "stream",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         if temperature is not None:
             if (
@@ -4842,15 +4805,9 @@ def get_optional_params(
        if frequency_penalty:
            optional_params["frequency_penalty"] = frequency_penalty
     elif custom_llm_provider == "anyscale":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "stream",
-            "max_tokens",
-            "stop",
-            "frequency_penalty",
-            "presence_penalty",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         if model in [
             "mistralai/Mistral-7B-Instruct-v0.1",
             "mistralai/Mixtral-8x7B-Instruct-v0.1",
@@ -4878,14 +4835,9 @@ def get_optional_params(
         if max_tokens:
             optional_params["max_tokens"] = max_tokens
     elif custom_llm_provider == "mistral":
-        supported_params = [
-            "temperature",
-            "top_p",
-            "stream",
-            "max_tokens",
-            "tools",
-            "tool_choice",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)
         if temperature is not None:
             optional_params["temperature"] = temperature
@@ -4912,25 +4864,9 @@ def get_optional_params(
                 extra_body  # openai client supports `extra_body` param
             )
     elif custom_llm_provider == "openrouter":
-        supported_params = [
-            "functions",
-            "function_call",
-            "temperature",
-            "top_p",
-            "n",
-            "stream",
-            "stop",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "user",
-            "response_format",
-            "seed",
-            "tools",
-            "tool_choice",
-            "max_retries",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider=custom_llm_provider
+        )
         _check_valid_arg(supported_params=supported_params)

         if functions is not None:
@@ -4984,28 +4920,9 @@ def get_optional_params(
         )
     else:  # assume passing in params for openai/azure openai
         print_verbose(f"UNMAPPED PROVIDER, ASSUMING IT'S OPENAI/AZURE")
-        supported_params = [
-            "functions",
-            "function_call",
-            "temperature",
-            "top_p",
-            "n",
-            "stream",
-            "stop",
-            "max_tokens",
-            "presence_penalty",
-            "frequency_penalty",
-            "logit_bias",
-            "user",
-            "response_format",
-            "seed",
-            "tools",
-            "tool_choice",
-            "max_retries",
-            "logprobs",
-            "top_logprobs",
-            "extra_headers",
-        ]
+        supported_params = get_supported_openai_params(
+            model=model, custom_llm_provider="openai"
+        )
         _check_valid_arg(supported_params=supported_params)
         if functions is not None:
             optional_params["functions"] = functions
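The hunks above repeat a single refactor: each provider branch of `get_optional_params` drops its hard-coded `supported_params` list and instead asks one helper, `get_supported_openai_params`, which OpenAI-style parameters that provider can accept, before `_check_valid_arg` validates the caller's arguments. Below is a minimal sketch of that pattern, not LiteLLM's actual implementation: the provider table is a hypothetical reduced version, and `check_valid_arg_sketch` is a simplified stand-in for the real nested `_check_valid_arg` helper.

```python
from typing import Dict, List

# Hypothetical, heavily reduced provider table; the real helper covers many more providers.
_SUPPORTED: Dict[str, List[str]] = {
    "petals": ["max_tokens", "temperature", "top_p", "stream"],
    "perplexity": ["temperature", "top_p", "stream", "max_tokens",
                   "presence_penalty", "frequency_penalty"],
}


def get_supported_openai_params_sketch(model: str, custom_llm_provider: str) -> List[str]:
    # One lookup replaces a hard-coded list inside every provider branch.
    return _SUPPORTED.get(custom_llm_provider, ["temperature", "top_p", "stream", "max_tokens"])


def check_valid_arg_sketch(passed_params: Dict[str, object], supported_params: List[str]) -> None:
    # Simplified stand-in for _check_valid_arg: reject params the provider cannot map.
    unsupported = [k for k, v in passed_params.items() if v is not None and k not in supported_params]
    if unsupported:
        raise ValueError(f"{unsupported} not supported by this provider")


# Usage: validating params for a (hypothetical) petals deployment
supported = get_supported_openai_params_sketch(
    model="petals-team/StableBeluga2", custom_llm_provider="petals"
)
check_valid_arg_sketch({"temperature": 0.7, "logit_bias": None}, supported)
```

Centralizing the table means a provider's parameter support changes in one place instead of being duplicated across a dozen branches.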
@@ -5063,15 +4980,228 @@ def get_optional_params(
     return optional_params


-def get_mapped_model_params(model: str, custom_llm_provider: str):
+def get_supported_openai_params(model: str, custom_llm_provider: str):
     """
     Returns the supported openai params for a given model + provider

+    Example:
+    ```
+    get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock")
+    ```
     """
     if custom_llm_provider == "bedrock":
         if model.startswith("anthropic.claude-3"):
             return litellm.AmazonAnthropicClaude3Config().get_supported_openai_params()
-        else:
+        elif model.startswith("anthropic"):
             return litellm.AmazonAnthropicConfig().get_supported_openai_params()
+        elif model.startswith("ai21"):
+            return ["max_tokens", "temperature", "top_p", "stream"]
+        elif model.startswith("amazon"):
+            return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+        elif model.startswith("meta"):
+            return ["max_tokens", "temperature", "top_p", "stream"]
+        elif model.startswith("cohere"):
+            return ["stream", "temperature", "max_tokens"]
+        elif model.startswith("mistral"):
+            return ["max_tokens", "temperature", "stop", "top_p", "stream"]
+    elif custom_llm_provider == "ollama_chat":
+        return litellm.OllamaChatConfig().get_supported_openai_params()
+    elif custom_llm_provider == "anthropic":
+        return [
+            "stream",
+            "stop",
+            "temperature",
+            "top_p",
+            "max_tokens",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "cohere":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "logit_bias",
+            "top_p",
+            "frequency_penalty",
+            "presence_penalty",
+            "stop",
+            "n",
+        ]
+    elif custom_llm_provider == "maritalk":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "presence_penalty",
+            "stop",
+        ]
+    elif custom_llm_provider == "openai" or custom_llm_provider == "azure":
+        return [
+            "functions",
+            "function_call",
+            "temperature",
+            "top_p",
+            "n",
+            "stream",
+            "stop",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "user",
+            "response_format",
+            "seed",
+            "tools",
+            "tool_choice",
+            "max_retries",
+            "logprobs",
+            "top_logprobs",
+            "extra_headers",
+        ]
+    elif custom_llm_provider == "openrouter":
+        return [
+            "functions",
+            "function_call",
+            "temperature",
+            "top_p",
+            "n",
+            "stream",
+            "stop",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "user",
+            "response_format",
+            "seed",
+            "tools",
+            "tool_choice",
+            "max_retries",
+        ]
+    elif custom_llm_provider == "mistral":
+        return [
+            "temperature",
+            "top_p",
+            "stream",
+            "max_tokens",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "replicate":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "stop",
+            "seed",
+        ]
+    elif custom_llm_provider == "huggingface":
+        return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+    elif custom_llm_provider == "together_ai":
+        return [
+            "stream",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "stop",
+            "frequency_penalty",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "ai21":
+        return [
+            "stream",
+            "n",
+            "temperature",
+            "max_tokens",
+            "top_p",
+            "stop",
+            "frequency_penalty",
+            "presence_penalty",
+        ]
+    elif custom_llm_provider == "palm" or custom_llm_provider == "gemini":
+        return ["temperature", "top_p", "stream", "n", "stop", "max_tokens"]
+    elif custom_llm_provider == "vertex_ai":
+        return [
+            "temperature",
+            "top_p",
+            "max_tokens",
+            "stream",
+            "tools",
+            "tool_choice",
+        ]
+    elif custom_llm_provider == "sagemaker":
+        return ["stream", "temperature", "max_tokens", "top_p", "stop", "n"]
+    elif custom_llm_provider == "aleph_alpha":
+        return [
+            "max_tokens",
+            "stream",
+            "top_p",
+            "temperature",
+            "presence_penalty",
+            "frequency_penalty",
+            "n",
+            "stop",
+        ]
+    elif custom_llm_provider == "cloudflare":
+        return ["max_tokens", "stream"]
+    elif custom_llm_provider == "ollama":
+        return [
+            "max_tokens",
+            "stream",
+            "top_p",
+            "temperature",
+            "frequency_penalty",
+            "stop",
+        ]
+    elif custom_llm_provider == "nlp_cloud":
+        return [
+            "max_tokens",
+            "stream",
+            "temperature",
+            "top_p",
+            "presence_penalty",
+            "frequency_penalty",
+            "n",
+            "stop",
+        ]
+    elif custom_llm_provider == "petals":
+        return ["max_tokens", "temperature", "top_p", "stream"]
+    elif custom_llm_provider == "deepinfra":
+        return [
+            "temperature",
+            "top_p",
+            "n",
+            "stream",
+            "stop",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+            "logit_bias",
+            "user",
+        ]
+    elif custom_llm_provider == "perplexity":
+        return [
+            "temperature",
+            "top_p",
+            "stream",
+            "max_tokens",
+            "presence_penalty",
+            "frequency_penalty",
+        ]
+    elif custom_llm_provider == "anyscale":
+        return [
+            "temperature",
+            "top_p",
+            "stream",
+            "max_tokens",
+            "stop",
+            "frequency_penalty",
+            "presence_penalty",
+        ]


 def get_llm_provider(
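This hunk renames `get_mapped_model_params` to `get_supported_openai_params` and moves all per-provider parameter tables into it. Calling the helper directly follows the docstring example added above; the import path below is an assumption about where the function lives in this version of litellm, so adjust it to your install.

```python
from litellm.utils import get_supported_openai_params  # import path is an assumption

# Bedrock dispatches on the model prefix (anthropic.claude-3 vs anthropic/ai21/amazon/meta/cohere/mistral)
print(get_supported_openai_params(model="anthropic.claude-3", custom_llm_provider="bedrock"))

# Most providers dispatch on custom_llm_provider alone; model name is a placeholder here
print(get_supported_openai_params(model="any-model", custom_llm_provider="cloudflare"))
# expected, per the table above: ['max_tokens', 'stream']
```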
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "litellm"
-version = "1.30.3"
+version = "1.30.4"
 description = "Library to easily interface with LLM API providers"
 authors = ["BerriAI"]
 license = "MIT"
@@ -74,7 +74,7 @@ requires = ["poetry-core", "wheel"]
 build-backend = "poetry.core.masonry.api"

 [tool.commitizen]
-version = "1.30.3"
+version = "1.30.4"
 version_files = [
     "pyproject.toml:^version"
 ]
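The version bump touches two fields, `[tool.poetry].version` and `[tool.commitizen].version`, which have to stay in lockstep; commitizen's `version_files` entry is what rewrites the poetry field on a bump. An optional sanity check one could run in CI (Python 3.11+ for `tomllib`; not part of this commit):

```python
import tomllib

with open("pyproject.toml", "rb") as f:
    pyproject = tomllib.load(f)

poetry_version = pyproject["tool"]["poetry"]["version"]
commitizen_version = pyproject["tool"]["commitizen"]["version"]

# Both should read 1.30.4 after this commit.
assert poetry_version == commitizen_version, (poetry_version, commitizen_version)
print("versions in sync:", poetry_version)
```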
@@ -9,6 +9,7 @@ import sys, os, dotenv
 from typing import Optional
 from dotenv import load_dotenv

+# Get the current directory of the file being run
 pwd = os.path.dirname(os.path.realpath(__file__))
 print(pwd)

@@ -37,12 +38,13 @@ def test_transcription():


 def test_transcription_azure():
+    litellm.set_verbose = True
     transcript = litellm.transcription(
         model="azure/azure-whisper",
         file=audio_file,
-        api_base=os.getenv("AZURE_EUROPE_API_BASE"),
+        api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
         api_key=os.getenv("AZURE_EUROPE_API_KEY"),
-        api_version=os.getenv("2024-02-15-preview"),
+        api_version="2024-02-15-preview",
     )

     assert transcript.text is not None
@@ -57,9 +59,9 @@ async def test_transcription_async_azure():
     transcript = await litellm.atranscription(
         model="azure/azure-whisper",
         file=audio_file,
-        api_base=os.getenv("AZURE_EUROPE_API_BASE"),
+        api_base="https://my-endpoint-europe-berri-992.openai.azure.com/",
         api_key=os.getenv("AZURE_EUROPE_API_KEY"),
-        api_version=os.getenv("2024-02-15-preview"),
+        api_version="2024-02-15-preview",
     )

     assert transcript.text is not None
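Two things change in these test hunks besides the hard-coded endpoint: the Azure test now turns on `litellm.set_verbose`, and `api_version` is passed as a literal string; the previous `os.getenv("2024-02-15-preview")` looked up a non-existent environment variable and returned `None`. A minimal standalone sketch of the same call, with the endpoint env vars and audio file as placeholders:

```python
import os
import litellm

litellm.set_verbose = True

# Placeholder audio clip; any short WAV/MP3 works.
audio_file = open("sample_audio.wav", "rb")

transcript = litellm.transcription(
    model="azure/azure-whisper",                  # azure/<whisper deployment name>
    file=audio_file,
    api_base=os.getenv("AZURE_EUROPE_API_BASE"),  # e.g. https://<resource>.openai.azure.com/
    api_key=os.getenv("AZURE_EUROPE_API_KEY"),
    api_version="2024-02-15-preview",             # literal string, not os.getenv(...)
)
print(transcript.text)
```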
@@ -96,7 +98,7 @@ async def test_transcription_on_router():
             "model_name": "whisper",
             "litellm_params": {
                 "model": "azure/azure-whisper",
-                "api_base": os.getenv("AZURE_EUROPE_API_BASE"),
+                "api_base": "https://my-endpoint-europe-berri-992.openai.azure.com/",
                 "api_key": os.getenv("AZURE_EUROPE_API_KEY"),
                 "api_version": "2024-02-15-preview",
             },
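The router test registers the Azure Whisper deployment under the alias `whisper`, which is what lets the router load balance transcription requests across deployments. A hedged sketch of the same setup used directly from Python; `Router.atranscription` is assumed to exist on this branch (it is what the PR wires up), so treat the call as illustrative rather than a guaranteed API:

```python
import asyncio
import os

from litellm import Router

model_list = [
    {
        "model_name": "whisper",  # alias clients call; add more entries to load balance
        "litellm_params": {
            "model": "azure/azure-whisper",
            "api_base": os.getenv("AZURE_EUROPE_API_BASE"),
            "api_key": os.getenv("AZURE_EUROPE_API_KEY"),
            "api_version": "2024-02-15-preview",
        },
    },
]


async def main():
    router = Router(model_list=model_list)
    with open("sample_audio.wav", "rb") as audio_file:  # placeholder clip
        transcript = await router.atranscription(model="whisper", file=audio_file)
    print(transcript.text)


asyncio.run(main())
```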