forked from phoenix/litellm-mirror
added changes from upstream
Merge branch 'main' into fix/error-on-get-user-role
This commit is contained in:
commit
d3a228d03b
142 changed files with 4439 additions and 801 deletions
|
@ -188,7 +188,7 @@ jobs:
|
|||
command: |
|
||||
docker run -d \
|
||||
-p 4000:4000 \
|
||||
-e DATABASE_URL=$PROXY_DOCKER_DB_URL \
|
||||
-e DATABASE_URL=$PROXY_DATABASE_URL \
|
||||
-e AZURE_API_KEY=$AZURE_API_KEY \
|
||||
-e REDIS_HOST=$REDIS_HOST \
|
||||
-e REDIS_PASSWORD=$REDIS_PASSWORD \
|
||||
|
@ -223,7 +223,7 @@ jobs:
|
|||
background: true
|
||||
- run:
|
||||
name: Wait for app to be ready
|
||||
command: dockerize -wait http://localhost:4000 -timeout 1m
|
||||
command: dockerize -wait http://localhost:4000 -timeout 5m
|
||||
- run:
|
||||
name: Run tests
|
||||
command: |
|
||||
|
|
51
.devcontainer/devcontainer.json
Normal file
51
.devcontainer/devcontainer.json
Normal file
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"name": "Python 3.11",
|
||||
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
|
||||
"image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
|
||||
// https://github.com/devcontainers/images/tree/main/src/python
|
||||
// https://mcr.microsoft.com/en-us/product/devcontainers/python/tags
|
||||
|
||||
// "build": {
|
||||
// "dockerfile": "Dockerfile",
|
||||
// "context": ".."
|
||||
// },
|
||||
|
||||
// Features to add to the dev container. More info: https://containers.dev/features.
|
||||
// "features": {},
|
||||
|
||||
// Configure tool-specific properties.
|
||||
"customizations": {
|
||||
// Configure properties specific to VS Code.
|
||||
"vscode": {
|
||||
"settings": {},
|
||||
"extensions": [
|
||||
"ms-python.python",
|
||||
"ms-python.vscode-pylance",
|
||||
"GitHub.copilot",
|
||||
"GitHub.copilot-chat"
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
// Use 'forwardPorts' to make a list of ports inside the container available locally.
|
||||
"forwardPorts": [4000],
|
||||
|
||||
"containerEnv": {
|
||||
"LITELLM_LOG": "DEBUG"
|
||||
},
|
||||
|
||||
// Use 'portsAttributes' to set default properties for specific forwarded ports.
|
||||
// More info: https://containers.dev/implementors/json_reference/#port-attributes
|
||||
"portsAttributes": {
|
||||
"4000": {
|
||||
"label": "LiteLLM Server",
|
||||
"onAutoForward": "notify"
|
||||
}
|
||||
},
|
||||
|
||||
// More info: https://aka.ms/dev-containers-non-root.
|
||||
// "remoteUser": "litellm",
|
||||
|
||||
// Use 'postCreateCommand' to run commands after the container is created.
|
||||
"postCreateCommand": "pipx install poetry && poetry install -E extra_proxy -E proxy"
|
||||
}
|
19
.github/workflows/interpret_load_test.py
vendored
19
.github/workflows/interpret_load_test.py
vendored
|
@ -64,6 +64,11 @@ if __name__ == "__main__":
|
|||
) # Replace with your repository's username and name
|
||||
latest_release = repo.get_latest_release()
|
||||
print("got latest release: ", latest_release)
|
||||
print(latest_release.title)
|
||||
print(latest_release.tag_name)
|
||||
|
||||
release_version = latest_release.title
|
||||
|
||||
print("latest release body: ", latest_release.body)
|
||||
print("markdown table: ", markdown_table)
|
||||
|
||||
|
@ -74,8 +79,22 @@ if __name__ == "__main__":
|
|||
start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
|
||||
existing_release_body = latest_release.body[:start_index]
|
||||
|
||||
docker_run_command = f"""
|
||||
\n\n
|
||||
## Docker Run LiteLLM Proxy
|
||||
|
||||
```
|
||||
docker run \\
|
||||
-e STORE_MODEL_IN_DB=True \\
|
||||
-p 4000:4000 \\
|
||||
ghcr.io/berriai/litellm:main-{release_version}
|
||||
```
|
||||
"""
|
||||
print("docker run command: ", docker_run_command)
|
||||
|
||||
new_release_body = (
|
||||
existing_release_body
|
||||
+ docker_run_command
|
||||
+ "\n\n"
|
||||
+ "### Don't want to maintain your internal proxy? get in touch 🎉"
|
||||
+ "\nHosted Proxy Alpha: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"
|
||||
|
|
4
.gitignore
vendored
4
.gitignore
vendored
|
@ -1,5 +1,6 @@
|
|||
.venv
|
||||
.env
|
||||
litellm/proxy/myenv/*
|
||||
litellm_uuid.txt
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
@ -52,3 +53,6 @@ litellm/proxy/_new_secret_config.yaml
|
|||
litellm/proxy/_new_secret_config.yaml
|
||||
litellm/proxy/_super_secret_config.yaml
|
||||
litellm/proxy/_super_secret_config.yaml
|
||||
litellm/proxy/myenv/bin/activate
|
||||
litellm/proxy/myenv/bin/Activate.ps1
|
||||
myenv/*
|
|
@ -16,11 +16,11 @@ repos:
|
|||
name: Check if files match
|
||||
entry: python3 ci_cd/check_files_match.py
|
||||
language: system
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: mypy
|
||||
name: mypy
|
||||
entry: python3 -m mypy --ignore-missing-imports
|
||||
language: system
|
||||
types: [python]
|
||||
files: ^litellm/
|
||||
# - repo: local
|
||||
# hooks:
|
||||
# - id: mypy
|
||||
# name: mypy
|
||||
# entry: python3 -m mypy --ignore-missing-imports
|
||||
# language: system
|
||||
# types: [python]
|
||||
# files: ^litellm/
|
|
@ -226,6 +226,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
|
|||
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
|
||||
| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅
|
||||
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |
|
||||
|
|
BIN
deploy/azure_resource_manager/azure_marketplace.zip
Normal file
BIN
deploy/azure_resource_manager/azure_marketplace.zip
Normal file
Binary file not shown.
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#",
|
||||
"handler": "Microsoft.Azure.CreateUIDef",
|
||||
"version": "0.1.2-preview",
|
||||
"parameters": {
|
||||
"config": {
|
||||
"isWizard": false,
|
||||
"basics": { }
|
||||
},
|
||||
"basics": [ ],
|
||||
"steps": [ ],
|
||||
"outputs": { },
|
||||
"resourceTypes": [ ]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
{
|
||||
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
|
||||
"contentVersion": "1.0.0.0",
|
||||
"parameters": {
|
||||
"imageName": {
|
||||
"type": "string",
|
||||
"defaultValue": "ghcr.io/berriai/litellm:main-latest"
|
||||
},
|
||||
"containerName": {
|
||||
"type": "string",
|
||||
"defaultValue": "litellm-container"
|
||||
},
|
||||
"dnsLabelName": {
|
||||
"type": "string",
|
||||
"defaultValue": "litellm"
|
||||
},
|
||||
"portNumber": {
|
||||
"type": "int",
|
||||
"defaultValue": 4000
|
||||
}
|
||||
},
|
||||
"resources": [
|
||||
{
|
||||
"type": "Microsoft.ContainerInstance/containerGroups",
|
||||
"apiVersion": "2021-03-01",
|
||||
"name": "[parameters('containerName')]",
|
||||
"location": "[resourceGroup().location]",
|
||||
"properties": {
|
||||
"containers": [
|
||||
{
|
||||
"name": "[parameters('containerName')]",
|
||||
"properties": {
|
||||
"image": "[parameters('imageName')]",
|
||||
"resources": {
|
||||
"requests": {
|
||||
"cpu": 1,
|
||||
"memoryInGB": 2
|
||||
}
|
||||
},
|
||||
"ports": [
|
||||
{
|
||||
"port": "[parameters('portNumber')]"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"osType": "Linux",
|
||||
"restartPolicy": "Always",
|
||||
"ipAddress": {
|
||||
"type": "Public",
|
||||
"ports": [
|
||||
{
|
||||
"protocol": "tcp",
|
||||
"port": "[parameters('portNumber')]"
|
||||
}
|
||||
],
|
||||
"dnsNameLabel": "[parameters('dnsLabelName')]"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
42
deploy/azure_resource_manager/main.bicep
Normal file
42
deploy/azure_resource_manager/main.bicep
Normal file
|
@ -0,0 +1,42 @@
|
|||
param imageName string = 'ghcr.io/berriai/litellm:main-latest'
|
||||
param containerName string = 'litellm-container'
|
||||
param dnsLabelName string = 'litellm'
|
||||
param portNumber int = 4000
|
||||
|
||||
resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = {
|
||||
name: containerName
|
||||
location: resourceGroup().location
|
||||
properties: {
|
||||
containers: [
|
||||
{
|
||||
name: containerName
|
||||
properties: {
|
||||
image: imageName
|
||||
resources: {
|
||||
requests: {
|
||||
cpu: 1
|
||||
memoryInGB: 2
|
||||
}
|
||||
}
|
||||
ports: [
|
||||
{
|
||||
port: portNumber
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
osType: 'Linux'
|
||||
restartPolicy: 'Always'
|
||||
ipAddress: {
|
||||
type: 'Public'
|
||||
ports: [
|
||||
{
|
||||
protocol: 'tcp'
|
||||
port: portNumber
|
||||
}
|
||||
]
|
||||
dnsNameLabel: dnsLabelName
|
||||
}
|
||||
}
|
||||
}
|
|
@ -83,6 +83,7 @@ def completion(
|
|||
top_p: Optional[float] = None,
|
||||
n: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
stream_options: Optional[dict] = None,
|
||||
stop=None,
|
||||
max_tokens: Optional[int] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
|
@ -139,6 +140,10 @@ def completion(
|
|||
|
||||
- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
|
||||
|
||||
- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true`
|
||||
|
||||
- `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
|
||||
|
||||
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
|
||||
|
||||
- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.
|
||||
|
|
|
@ -47,3 +47,12 @@ Pricing is based on usage. We can figure out a price that works for your team, o
|
|||
<Image img={require('../img/litellm_hosted_ui_router.png')} />
|
||||
|
||||
#### [**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
|
||||
|
||||
## Feature List
|
||||
|
||||
- Easy way to add/remove models
|
||||
- 100% uptime even when models are added/removed
|
||||
- custom callback webhooks
|
||||
- your domain name with HTTPS
|
||||
- Ability to create/delete User API keys
|
||||
- Reasonable set monthly cost
|
|
@ -14,14 +14,14 @@ import TabItem from '@theme/TabItem';
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['OPENAI_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="gpt-3.5-turbo")
|
||||
|
@ -30,7 +30,7 @@ messages = [
|
|||
content="what model are you"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -39,14 +39,14 @@ chat(messages)
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['ANTHROPIC_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="claude-2", temperature=0.3)
|
||||
|
@ -55,7 +55,7 @@ messages = [
|
|||
content="what model are you"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -64,14 +64,14 @@ chat(messages)
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['REPLICATE_API_TOKEN'] = ""
|
||||
chat = ChatLiteLLM(model="replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1")
|
||||
|
@ -80,7 +80,7 @@ messages = [
|
|||
content="what model are you?"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
@ -89,14 +89,14 @@ chat(messages)
|
|||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
from langchain_community.chat_models import ChatLiteLLM
|
||||
from langchain_core.prompts import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['COHERE_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="command-nightly")
|
||||
|
@ -105,32 +105,9 @@ messages = [
|
|||
content="what model are you?"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
chat.invoke(messages)
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="palm" label="PaLM - Google">
|
||||
|
||||
```python
|
||||
import os
|
||||
from langchain.chat_models import ChatLiteLLM
|
||||
from langchain.prompts.chat import (
|
||||
ChatPromptTemplate,
|
||||
SystemMessagePromptTemplate,
|
||||
AIMessagePromptTemplate,
|
||||
HumanMessagePromptTemplate,
|
||||
)
|
||||
from langchain.schema import AIMessage, HumanMessage, SystemMessage
|
||||
|
||||
os.environ['PALM_API_KEY'] = ""
|
||||
chat = ChatLiteLLM(model="palm/chat-bison")
|
||||
messages = [
|
||||
HumanMessage(
|
||||
content="what model are you?"
|
||||
)
|
||||
]
|
||||
chat(messages)
|
||||
```
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
|
|
@ -94,9 +94,10 @@ print(response)
|
|||
|
||||
```
|
||||
|
||||
### Set Custom Trace ID, Trace User ID and Tags
|
||||
### Set Custom Trace ID, Trace User ID, Trace Metadata, Trace Version, Trace Release and Tags
|
||||
|
||||
Pass `trace_id`, `trace_user_id`, `trace_metadata`, `trace_version`, `trace_release`, `tags` in `metadata`
|
||||
|
||||
Pass `trace_id`, `trace_user_id` in `metadata`
|
||||
|
||||
```python
|
||||
import litellm
|
||||
|
@ -121,12 +122,20 @@ response = completion(
|
|||
metadata={
|
||||
"generation_name": "ishaan-test-generation", # set langfuse Generation Name
|
||||
"generation_id": "gen-id22", # set langfuse Generation ID
|
||||
"version": "test-generation-version" # set langfuse Generation Version
|
||||
"trace_user_id": "user-id2", # set langfuse Trace User ID
|
||||
"session_id": "session-1", # set langfuse Session ID
|
||||
"tags": ["tag1", "tag2"] # set langfuse Tags
|
||||
"tags": ["tag1", "tag2"], # set langfuse Tags
|
||||
"trace_id": "trace-id22", # set langfuse Trace ID
|
||||
"trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
|
||||
"trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
|
||||
"trace_release": "test-trace-release", # set langfuse Trace Release
|
||||
### OR ###
|
||||
"existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
|
||||
"existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
|
||||
### OR enforce that certain fields are trace overwritten in the trace during the continuation ###
|
||||
"existing_trace_id": "trace-id22",
|
||||
"trace_metadata": {"key": "updated_trace_value"}, # The new value to use for the langfuse Trace Metadata
|
||||
"update_trace_keys": ["input", "output", "trace_metadata"], # Updates the trace input & output to be this generations input & output also updates the Trace Metadata to match the passed in value
|
||||
},
|
||||
)
|
||||
|
||||
|
@ -134,6 +143,38 @@ print(response)
|
|||
|
||||
```
|
||||
|
||||
### Trace & Generation Parameters
|
||||
|
||||
#### Trace Specific Parameters
|
||||
|
||||
* `trace_id` - Identifier for the trace, must use `existing_trace_id` instead or in conjunction with `trace_id` if this is an existing trace, auto-generated by default
|
||||
* `trace_name` - Name of the trace, auto-generated by default
|
||||
* `session_id` - Session identifier for the trace, defaults to `None`
|
||||
* `trace_version` - Version for the trace, defaults to value for `version`
|
||||
* `trace_release` - Release for the trace, defaults to `None`
|
||||
* `trace_metadata` - Metadata for the trace, defaults to `None`
|
||||
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
|
||||
* `tags` - Tags for the trace, defeaults to `None`
|
||||
|
||||
##### Updatable Parameters on Continuation
|
||||
|
||||
The following parameters can be updated on a continuation of a trace by passing in the following values into the `update_trace_keys` in the metadata of the completion.
|
||||
|
||||
* `input` - Will set the traces input to be the input of this latest generation
|
||||
* `output` - Will set the traces output to be the output of this generation
|
||||
* `trace_version` - Will set the trace version to be the provided value (To use the latest generations version instead, use `version`)
|
||||
* `trace_release` - Will set the trace release to be the provided value
|
||||
* `trace_metadata` - Will set the trace metadata to the provided value
|
||||
* `trace_user_id` - Will set the trace user id to the provided value
|
||||
|
||||
#### Generation Specific Parameters
|
||||
|
||||
* `generation_id` - Identifier for the generation, auto-generated by default
|
||||
* `generation_name` - Identifier for the generation, auto-generated by default
|
||||
* `prompt` - Langfuse prompt object used for the generation, defaults to None
|
||||
|
||||
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
|
||||
|
||||
### Use LangChain ChatLiteLLM + Langfuse
|
||||
Pass `trace_user_id`, `session_id` in model_kwargs
|
||||
```python
|
||||
|
|
54
docs/my-website/docs/providers/deepseek.md
Normal file
54
docs/my-website/docs/providers/deepseek.md
Normal file
|
@ -0,0 +1,54 @@
|
|||
# Deepseek
|
||||
https://deepseek.com/
|
||||
|
||||
**We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests**
|
||||
|
||||
## API Key
|
||||
```python
|
||||
# env variable
|
||||
os.environ['DEEPSEEK_API_KEY']
|
||||
```
|
||||
|
||||
## Sample Usage
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['DEEPSEEK_API_KEY'] = ""
|
||||
response = completion(
|
||||
model="deepseek/deepseek-chat",
|
||||
messages=[
|
||||
{"role": "user", "content": "hello from litellm"}
|
||||
],
|
||||
)
|
||||
print(response)
|
||||
```
|
||||
|
||||
## Sample Usage - Streaming
|
||||
```python
|
||||
from litellm import completion
|
||||
import os
|
||||
|
||||
os.environ['DEEPSEEK_API_KEY'] = ""
|
||||
response = completion(
|
||||
model="deepseek/deepseek-chat",
|
||||
messages=[
|
||||
{"role": "user", "content": "hello from litellm"}
|
||||
],
|
||||
stream=True
|
||||
)
|
||||
|
||||
for chunk in response:
|
||||
print(chunk)
|
||||
```
|
||||
|
||||
|
||||
## Supported Models - ALL Deepseek Models Supported!
|
||||
We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests
|
||||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
|
||||
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
|
||||
|
||||
|
|
@ -44,14 +44,14 @@ for chunk in response:
|
|||
## Supported Models
|
||||
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/c1b25538277206b9f00de5254d80d6a83bb19a29/model_prices_and_context_window.json).
|
||||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| mistral-tiny | `completion(model="mistral/mistral-tiny", messages)` |
|
||||
| mistral-small | `completion(model="mistral/mistral-small", messages)` |
|
||||
| mistral-medium | `completion(model="mistral/mistral-medium", messages)` |
|
||||
| mistral-large-latest | `completion(model="mistral/mistral-large-latest", messages)` |
|
||||
| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
|
||||
|
||||
| Model Name | Function Call |
|
||||
|----------------|--------------------------------------------------------------|
|
||||
| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
|
||||
| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
|
||||
| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` |
|
||||
| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
|
||||
| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
|
||||
| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |
|
||||
|
||||
## Function Calling
|
||||
|
||||
|
@ -116,6 +116,6 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported
|
|||
|
||||
| Model Name | Function Call |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| mistral-embed | `embedding(model="mistral/mistral-embed", input)` |
|
||||
| Mistral Embeddings | `embedding(model="mistral/mistral-embed", input)` |
|
||||
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@ This is a new feature, and subject to changes based on feedback.
|
|||
### Step 1. Setup Proxy
|
||||
|
||||
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
|
||||
- `JWT_AUDIENCE`: This is the audience used for decoding the JWT. If not set, the decode step will not verify the audience.
|
||||
|
||||
```bash
|
||||
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
|
||||
|
|
|
@ -12,8 +12,8 @@ Requirements:
|
|||
|
||||
You can set budgets at 3 levels:
|
||||
- For the proxy
|
||||
- For a user
|
||||
- For a 'user' passed to `/chat/completions`, `/embeddings` etc
|
||||
- For an internal user
|
||||
- For an end-user
|
||||
- For a key
|
||||
- For a key (model specific budgets)
|
||||
|
||||
|
@ -58,7 +58,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
|
|||
}'
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user" label="For User">
|
||||
<TabItem value="per-user" label="For Internal User">
|
||||
|
||||
Apply a budget across multiple keys.
|
||||
|
||||
|
@ -165,12 +165,12 @@ curl --location 'http://localhost:4000/team/new' \
|
|||
}
|
||||
```
|
||||
</TabItem>
|
||||
<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
|
||||
<TabItem value="per-user-chat" label="For End User">
|
||||
|
||||
Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**
|
||||
|
||||
**Step 1. Modify config.yaml**
|
||||
Define `litellm.max_user_budget`
|
||||
Define `litellm.max_end_user_budget`
|
||||
```yaml
|
||||
general_settings:
|
||||
master_key: sk-1234
|
||||
|
@ -328,7 +328,7 @@ You can set:
|
|||
- max parallel requests
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="per-user" label="Per User">
|
||||
<TabItem value="per-user" label="Per Internal User">
|
||||
|
||||
Use `/user/new`, to persist rate limits across multiple keys.
|
||||
|
||||
|
@ -408,7 +408,7 @@ curl --location 'http://localhost:4000/user/new' \
|
|||
```
|
||||
|
||||
|
||||
## Create new keys for existing user
|
||||
## Create new keys for existing internal user
|
||||
|
||||
Just include user_id in the `/key/generate` request.
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@ print(response)
|
|||
- `router.aimage_generation()` - async image generation calls
|
||||
|
||||
## Advanced - Routing Strategies
|
||||
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
|
||||
#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based
|
||||
|
||||
Router provides 4 strategies for routing your calls across multiple deployments:
|
||||
|
||||
|
@ -467,6 +467,101 @@ async def router_acompletion():
|
|||
asyncio.run(router_acompletion())
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
|
||||
|
||||
Picks a deployment based on the lowest cost
|
||||
|
||||
How this works:
|
||||
- Get all healthy deployments
|
||||
- Select all deployments that are under their provided `rpm/tpm` limits
|
||||
- For each deployment check if `litellm_param["model"]` exists in [`litellm_model_cost_map`](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
|
||||
- if deployment does not exist in `litellm_model_cost_map` -> use deployment_cost= `$1`
|
||||
- Select deployment with lowest cost
|
||||
|
||||
```python
|
||||
from litellm import Router
|
||||
import asyncio
|
||||
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {"model": "gpt-4"},
|
||||
"model_info": {"id": "openai-gpt-4"},
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {"model": "groq/llama3-8b-8192"},
|
||||
"model_info": {"id": "groq-llama"},
|
||||
},
|
||||
]
|
||||
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
|
||||
print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
|
||||
```
|
||||
|
||||
|
||||
#### Using Custom Input/Output pricing
|
||||
|
||||
Set `litellm_params["input_cost_per_token"]` and `litellm_params["output_cost_per_token"]` for using custom pricing when routing
|
||||
|
||||
```python
|
||||
model_list = [
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"input_cost_per_token": 0.00003,
|
||||
"output_cost_per_token": 0.00003,
|
||||
},
|
||||
"model_info": {"id": "chatgpt-v-experimental"},
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-1",
|
||||
"input_cost_per_token": 0.000000001,
|
||||
"output_cost_per_token": 0.00000001,
|
||||
},
|
||||
"model_info": {"id": "chatgpt-v-1"},
|
||||
},
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "azure/chatgpt-v-5",
|
||||
"input_cost_per_token": 10,
|
||||
"output_cost_per_token": 12,
|
||||
},
|
||||
"model_info": {"id": "chatgpt-v-5"},
|
||||
},
|
||||
]
|
||||
# init router
|
||||
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
|
||||
async def router_acompletion():
|
||||
response = await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}]
|
||||
)
|
||||
print(response)
|
||||
|
||||
print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost
|
||||
return response
|
||||
|
||||
asyncio.run(router_acompletion())
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
|
||||
</Tabs>
|
||||
|
@ -991,6 +1086,46 @@ async def test_acompletion_caching_on_router_caching_groups():
|
|||
asyncio.run(test_acompletion_caching_on_router_caching_groups())
|
||||
```
|
||||
|
||||
## Alerting 🚨
|
||||
|
||||
Send alerts to slack / your webhook url for the following events
|
||||
- LLM API Exceptions
|
||||
- Slow LLM Responses
|
||||
|
||||
Get a slack webhook url from https://api.slack.com/messaging/webhooks
|
||||
|
||||
#### Usage
|
||||
Initialize an `AlertingConfig` and pass it to `litellm.Router`. The following code will trigger an alert because `api_key=bad-key` which is invalid
|
||||
|
||||
```python
|
||||
from litellm.router import AlertingConfig
|
||||
import litellm
|
||||
import os
|
||||
|
||||
router = litellm.Router(
|
||||
model_list=[
|
||||
{
|
||||
"model_name": "gpt-3.5-turbo",
|
||||
"litellm_params": {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"api_key": "bad_key",
|
||||
},
|
||||
}
|
||||
],
|
||||
alerting_config= AlertingConfig(
|
||||
alerting_threshold=10, # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
|
||||
webhook_url= os.getenv("SLACK_WEBHOOK_URL") # webhook you want to send alerts to
|
||||
),
|
||||
)
|
||||
try:
|
||||
await router.acompletion(
|
||||
model="gpt-3.5-turbo",
|
||||
messages=[{"role": "user", "content": "Hey, how's it going?"}],
|
||||
)
|
||||
except:
|
||||
pass
|
||||
```
|
||||
|
||||
## Track cost for Azure Deployments
|
||||
|
||||
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking
|
||||
|
@ -1159,6 +1294,7 @@ def __init__(
|
|||
"least-busy",
|
||||
"usage-based-routing",
|
||||
"latency-based-routing",
|
||||
"cost-based-routing",
|
||||
] = "simple-shuffle",
|
||||
|
||||
## DEBUGGING ##
|
||||
|
|
|
@ -134,6 +134,7 @@ const sidebars = {
|
|||
"providers/ollama",
|
||||
"providers/perplexity",
|
||||
"providers/groq",
|
||||
"providers/deepseek",
|
||||
"providers/fireworks_ai",
|
||||
"providers/vllm",
|
||||
"providers/xinference",
|
||||
|
|
|
@ -291,7 +291,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
|
|||
|
||||
|
||||
def _forecast_daily_cost(data: list):
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
if len(data) == 0:
|
||||
|
|
|
@ -361,6 +361,7 @@ openai_compatible_endpoints: List = [
|
|||
"api.deepinfra.com/v1/openai",
|
||||
"api.mistral.ai/v1",
|
||||
"api.groq.com/openai/v1",
|
||||
"api.deepseek.com/v1",
|
||||
"api.together.xyz/v1",
|
||||
]
|
||||
|
||||
|
@ -369,6 +370,7 @@ openai_compatible_providers: List = [
|
|||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"deepseek",
|
||||
"deepinfra",
|
||||
"perplexity",
|
||||
"xinference",
|
||||
|
@ -523,6 +525,7 @@ provider_list: List = [
|
|||
"anyscale",
|
||||
"mistral",
|
||||
"groq",
|
||||
"deepseek",
|
||||
"maritalk",
|
||||
"voyage",
|
||||
"cloudflare",
|
||||
|
|
|
@ -10,8 +10,8 @@
|
|||
# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
|
||||
import os
|
||||
import inspect
|
||||
import redis, litellm
|
||||
import redis.asyncio as async_redis
|
||||
import redis, litellm # type: ignore
|
||||
import redis.asyncio as async_redis # type: ignore
|
||||
from typing import List, Optional
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
import os, json, time
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse
|
||||
import requests, threading
|
||||
import requests, threading # type: ignore
|
||||
from typing import Optional, Union, Literal
|
||||
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ class InMemoryCache(BaseCache):
|
|||
return_val.append(val)
|
||||
return return_val
|
||||
|
||||
async def async_increment(self, key, value: int, **kwargs) -> int:
|
||||
async def async_increment(self, key, value: float, **kwargs) -> float:
|
||||
# get the value
|
||||
init_value = await self.async_get_cache(key=key) or 0
|
||||
value = init_value + value
|
||||
|
@ -423,12 +423,12 @@ class RedisCache(BaseCache):
|
|||
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
|
||||
await self.flush_cache_buffer() # logging done in here
|
||||
|
||||
async def async_increment(self, key, value: int, **kwargs) -> int:
|
||||
async def async_increment(self, key, value: float, **kwargs) -> float:
|
||||
_redis_client = self.init_async_client()
|
||||
start_time = time.time()
|
||||
try:
|
||||
async with _redis_client as redis_client:
|
||||
result = await redis_client.incr(name=key, amount=value)
|
||||
result = await redis_client.incrbyfloat(name=key, amount=value)
|
||||
## LOGGING ##
|
||||
end_time = time.time()
|
||||
_duration = end_time - start_time
|
||||
|
@ -1382,18 +1382,41 @@ class DualCache(BaseCache):
|
|||
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_batch_set_cache(
|
||||
self, cache_list: list, local_only: bool = False, **kwargs
|
||||
):
|
||||
"""
|
||||
Batch write values to the cache
|
||||
"""
|
||||
print_verbose(
|
||||
f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
|
||||
)
|
||||
try:
|
||||
if self.in_memory_cache is not None:
|
||||
await self.in_memory_cache.async_set_cache_pipeline(
|
||||
cache_list=cache_list, **kwargs
|
||||
)
|
||||
|
||||
if self.redis_cache is not None and local_only == False:
|
||||
await self.redis_cache.async_set_cache_pipeline(
|
||||
cache_list=cache_list, ttl=kwargs.get("ttl", None)
|
||||
)
|
||||
except Exception as e:
|
||||
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
|
||||
traceback.print_exc()
|
||||
|
||||
async def async_increment_cache(
|
||||
self, key, value: int, local_only: bool = False, **kwargs
|
||||
) -> int:
|
||||
self, key, value: float, local_only: bool = False, **kwargs
|
||||
) -> float:
|
||||
"""
|
||||
Key - the key in cache
|
||||
|
||||
Value - int - the value you want to increment by
|
||||
Value - float - the value you want to increment by
|
||||
|
||||
Returns - int - the incremented value
|
||||
Returns - float - the incremented value
|
||||
"""
|
||||
try:
|
||||
result: int = value
|
||||
result: float = value
|
||||
if self.in_memory_cache is not None:
|
||||
result = await self.in_memory_cache.async_increment(
|
||||
key, value, **kwargs
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#### What this does ####
|
||||
# On success + failure, log events to aispend.io
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -4,18 +4,30 @@ import datetime
|
|||
class AthinaLogger:
|
||||
def __init__(self):
|
||||
import os
|
||||
|
||||
self.athina_api_key = os.getenv("ATHINA_API_KEY")
|
||||
self.headers = {
|
||||
"athina-api-key": self.athina_api_key,
|
||||
"Content-Type": "application/json"
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
|
||||
self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]
|
||||
self.additional_keys = [
|
||||
"environment",
|
||||
"prompt_slug",
|
||||
"customer_id",
|
||||
"customer_user_id",
|
||||
"session_id",
|
||||
"external_reference_id",
|
||||
"context",
|
||||
"expected_response",
|
||||
"user_query",
|
||||
]
|
||||
|
||||
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import json
|
||||
import traceback
|
||||
|
||||
try:
|
||||
response_json = response_obj.model_dump() if response_obj else {}
|
||||
data = {
|
||||
|
@ -23,19 +35,30 @@ class AthinaLogger:
|
|||
"request": kwargs,
|
||||
"response": response_json,
|
||||
"prompt_tokens": response_json.get("usage", {}).get("prompt_tokens"),
|
||||
"completion_tokens": response_json.get("usage", {}).get("completion_tokens"),
|
||||
"completion_tokens": response_json.get("usage", {}).get(
|
||||
"completion_tokens"
|
||||
),
|
||||
"total_tokens": response_json.get("usage", {}).get("total_tokens"),
|
||||
}
|
||||
|
||||
if type(end_time) == datetime.datetime and type(start_time) == datetime.datetime:
|
||||
data["response_time"] = int((end_time - start_time).total_seconds() * 1000)
|
||||
if (
|
||||
type(end_time) == datetime.datetime
|
||||
and type(start_time) == datetime.datetime
|
||||
):
|
||||
data["response_time"] = int(
|
||||
(end_time - start_time).total_seconds() * 1000
|
||||
)
|
||||
|
||||
if "messages" in kwargs:
|
||||
data["prompt"] = kwargs.get("messages", None)
|
||||
|
||||
# Directly add tools or functions if present
|
||||
optional_params = kwargs.get("optional_params", {})
|
||||
data.update((k, v) for k, v in optional_params.items() if k in ["tools", "functions"])
|
||||
data.update(
|
||||
(k, v)
|
||||
for k, v in optional_params.items()
|
||||
if k in ["tools", "functions"]
|
||||
)
|
||||
|
||||
# Add additional metadata keys
|
||||
metadata = kwargs.get("litellm_params", {}).get("metadata", {})
|
||||
|
@ -44,11 +67,19 @@ class AthinaLogger:
|
|||
if key in metadata:
|
||||
data[key] = metadata[key]
|
||||
|
||||
response = requests.post(self.athina_logging_url, headers=self.headers, data=json.dumps(data, default=str))
|
||||
response = requests.post(
|
||||
self.athina_logging_url,
|
||||
headers=self.headers,
|
||||
data=json.dumps(data, default=str),
|
||||
)
|
||||
if response.status_code != 200:
|
||||
print_verbose(f"Athina Logger Error - {response.text}, {response.status_code}")
|
||||
print_verbose(
|
||||
f"Athina Logger Error - {response.text}, {response.status_code}"
|
||||
)
|
||||
else:
|
||||
print_verbose(f"Athina Logger Succeeded - {response.text}")
|
||||
except Exception as e:
|
||||
print_verbose(f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}")
|
||||
print_verbose(
|
||||
f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}"
|
||||
)
|
||||
pass
|
|
@ -1,7 +1,7 @@
|
|||
#### What this does ####
|
||||
# On success + failure, log events to aispend.io
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Promptlayer
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
from litellm.caching import DualCache
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Promptlayer
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
from litellm.caching import DualCache
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
import requests
|
||||
import requests # type: ignore
|
||||
import json
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
class GreenscaleLogger:
|
||||
def __init__(self):
|
||||
import os
|
||||
|
||||
self.greenscale_api_key = os.getenv("GREENSCALE_API_KEY")
|
||||
self.headers = {
|
||||
"api-key": self.greenscale_api_key,
|
||||
"Content-Type": "application/json"
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
self.greenscale_logging_url = os.getenv("GREENSCALE_ENDPOINT")
|
||||
|
||||
|
@ -19,13 +21,18 @@ class GreenscaleLogger:
|
|||
data = {
|
||||
"modelId": kwargs.get("model"),
|
||||
"inputTokenCount": response_json.get("usage", {}).get("prompt_tokens"),
|
||||
"outputTokenCount": response_json.get("usage", {}).get("completion_tokens"),
|
||||
"outputTokenCount": response_json.get("usage", {}).get(
|
||||
"completion_tokens"
|
||||
),
|
||||
}
|
||||
data["timestamp"] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
|
||||
data["timestamp"] = datetime.now(timezone.utc).strftime(
|
||||
"%Y-%m-%dT%H:%M:%SZ"
|
||||
)
|
||||
|
||||
if type(end_time) == datetime and type(start_time) == datetime:
|
||||
data["invocationLatency"] = int((end_time - start_time).total_seconds() * 1000)
|
||||
|
||||
data["invocationLatency"] = int(
|
||||
(end_time - start_time).total_seconds() * 1000
|
||||
)
|
||||
|
||||
# Add additional metadata keys to tags
|
||||
tags = []
|
||||
|
@ -37,15 +44,25 @@ class GreenscaleLogger:
|
|||
elif key == "greenscale_application":
|
||||
data["application"] = value
|
||||
else:
|
||||
tags.append({"key": key.replace("greenscale_", ""), "value": str(value)})
|
||||
tags.append(
|
||||
{"key": key.replace("greenscale_", ""), "value": str(value)}
|
||||
)
|
||||
|
||||
data["tags"] = tags
|
||||
|
||||
response = requests.post(self.greenscale_logging_url, headers=self.headers, data=json.dumps(data, default=str))
|
||||
response = requests.post(
|
||||
self.greenscale_logging_url,
|
||||
headers=self.headers,
|
||||
data=json.dumps(data, default=str),
|
||||
)
|
||||
if response.status_code != 200:
|
||||
print_verbose(f"Greenscale Logger Error - {response.text}, {response.status_code}")
|
||||
print_verbose(
|
||||
f"Greenscale Logger Error - {response.text}, {response.status_code}"
|
||||
)
|
||||
else:
|
||||
print_verbose(f"Greenscale Logger Succeeded - {response.text}")
|
||||
except Exception as e:
|
||||
print_verbose(f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}")
|
||||
print_verbose(
|
||||
f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}"
|
||||
)
|
||||
pass
|
|
@ -1,7 +1,7 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Helicone
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import litellm
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
|
|
|
@ -262,6 +262,7 @@ class LangFuseLogger:
|
|||
|
||||
try:
|
||||
tags = []
|
||||
metadata = copy.deepcopy(metadata) # Avoid modifying the original metadata
|
||||
supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
|
||||
supports_prompt = Version(langfuse.version.__version__) >= Version("2.7.3")
|
||||
supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
|
||||
|
@ -272,36 +273,9 @@ class LangFuseLogger:
|
|||
print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
|
||||
|
||||
if supports_tags:
|
||||
metadata_tags = metadata.get("tags", [])
|
||||
metadata_tags = metadata.pop("tags", [])
|
||||
tags = metadata_tags
|
||||
|
||||
trace_name = metadata.get("trace_name", None)
|
||||
trace_id = metadata.get("trace_id", None)
|
||||
existing_trace_id = metadata.get("existing_trace_id", None)
|
||||
if trace_name is None and existing_trace_id is None:
|
||||
# just log `litellm-{call_type}` as the trace name
|
||||
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
|
||||
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
if existing_trace_id is not None:
|
||||
trace_params = {"id": existing_trace_id}
|
||||
else: # don't overwrite an existing trace
|
||||
trace_params = {
|
||||
"name": trace_name,
|
||||
"input": input,
|
||||
"user_id": metadata.get("trace_user_id", user_id),
|
||||
"id": trace_id,
|
||||
"session_id": metadata.get("session_id", None),
|
||||
}
|
||||
|
||||
if level == "ERROR":
|
||||
trace_params["status_message"] = output
|
||||
else:
|
||||
trace_params["output"] = output
|
||||
|
||||
cost = kwargs.get("response_cost", None)
|
||||
print_verbose(f"trace: {cost}")
|
||||
|
||||
# Clean Metadata before logging - never log raw metadata
|
||||
# the raw metadata can contain circular references which leads to infinite recursion
|
||||
# we clean out all extra litellm metadata params before logging
|
||||
|
@ -328,6 +302,66 @@ class LangFuseLogger:
|
|||
else:
|
||||
clean_metadata[key] = value
|
||||
|
||||
session_id = clean_metadata.pop("session_id", None)
|
||||
trace_name = clean_metadata.pop("trace_name", None)
|
||||
trace_id = clean_metadata.pop("trace_id", None)
|
||||
existing_trace_id = clean_metadata.pop("existing_trace_id", None)
|
||||
update_trace_keys = clean_metadata.pop("update_trace_keys", [])
|
||||
|
||||
if trace_name is None and existing_trace_id is None:
|
||||
# just log `litellm-{call_type}` as the trace name
|
||||
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
|
||||
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
||||
if existing_trace_id is not None:
|
||||
trace_params = {"id": existing_trace_id}
|
||||
|
||||
# Update the following keys for this trace
|
||||
for metadata_param_key in update_trace_keys:
|
||||
trace_param_key = metadata_param_key.replace("trace_", "")
|
||||
if trace_param_key not in trace_params:
|
||||
updated_trace_value = clean_metadata.pop(
|
||||
metadata_param_key, None
|
||||
)
|
||||
if updated_trace_value is not None:
|
||||
trace_params[trace_param_key] = updated_trace_value
|
||||
|
||||
# Pop the trace specific keys that would have been popped if there were a new trace
|
||||
for key in list(
|
||||
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
|
||||
):
|
||||
clean_metadata.pop(key, None)
|
||||
|
||||
# Special keys that are found in the function arguments and not the metadata
|
||||
if "input" in update_trace_keys:
|
||||
trace_params["input"] = input
|
||||
if "output" in update_trace_keys:
|
||||
trace_params["output"] = output
|
||||
else: # don't overwrite an existing trace
|
||||
trace_params = {
|
||||
"id": trace_id,
|
||||
"name": trace_name,
|
||||
"session_id": session_id,
|
||||
"input": input,
|
||||
"version": clean_metadata.pop(
|
||||
"trace_version", clean_metadata.get("version", None)
|
||||
), # If provided just version, it will applied to the trace as well, if applied a trace version it will take precedence
|
||||
}
|
||||
for key in list(
|
||||
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
|
||||
):
|
||||
trace_params[key.replace("trace_", "")] = clean_metadata.pop(
|
||||
key, None
|
||||
)
|
||||
|
||||
if level == "ERROR":
|
||||
trace_params["status_message"] = output
|
||||
else:
|
||||
trace_params["output"] = output
|
||||
|
||||
cost = kwargs.get("response_cost", None)
|
||||
print_verbose(f"trace: {cost}")
|
||||
|
||||
if (
|
||||
litellm._langfuse_default_tags is not None
|
||||
and isinstance(litellm._langfuse_default_tags, list)
|
||||
|
@ -387,7 +421,7 @@ class LangFuseLogger:
|
|||
"completion_tokens": response_obj["usage"]["completion_tokens"],
|
||||
"total_cost": cost if supports_costs else None,
|
||||
}
|
||||
generation_name = metadata.get("generation_name", None)
|
||||
generation_name = clean_metadata.pop("generation_name", None)
|
||||
if generation_name is None:
|
||||
# just log `litellm-{call_type}` as the generation name
|
||||
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
|
||||
|
@ -402,7 +436,7 @@ class LangFuseLogger:
|
|||
|
||||
generation_params = {
|
||||
"name": generation_name,
|
||||
"id": metadata.get("generation_id", generation_id),
|
||||
"id": clean_metadata.pop("generation_id", generation_id),
|
||||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
"model": kwargs["model"],
|
||||
|
@ -412,10 +446,11 @@ class LangFuseLogger:
|
|||
"usage": usage,
|
||||
"metadata": clean_metadata,
|
||||
"level": level,
|
||||
"version": clean_metadata.pop("version", None),
|
||||
}
|
||||
|
||||
if supports_prompt:
|
||||
generation_params["prompt"] = metadata.get("prompt", None)
|
||||
generation_params["prompt"] = clean_metadata.pop("prompt", None)
|
||||
|
||||
if output is not None and isinstance(output, str) and level == "ERROR":
|
||||
generation_params["status_message"] = output
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Langsmith
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests
|
||||
import dotenv, os # type: ignore
|
||||
import requests # type: ignore
|
||||
from datetime import datetime
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
import asyncio
|
||||
import types
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel # type: ignore
|
||||
|
||||
|
||||
def is_serializable(value):
|
||||
|
@ -79,8 +78,6 @@ class LangsmithLogger:
|
|||
except:
|
||||
response_obj = response_obj.dict() # type: ignore
|
||||
|
||||
print(f"response_obj: {response_obj}")
|
||||
|
||||
data = {
|
||||
"name": run_name,
|
||||
"run_type": "llm", # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
|
||||
|
@ -90,7 +87,6 @@ class LangsmithLogger:
|
|||
"start_time": start_time,
|
||||
"end_time": end_time,
|
||||
}
|
||||
print(f"data: {data}")
|
||||
|
||||
response = requests.post(
|
||||
"https://api.smith.langchain.com/runs",
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
|
||||
|
||||
import dotenv, os, json
|
||||
import requests
|
||||
import litellm
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
|
@ -60,7 +59,7 @@ class OpenMeterLogger(CustomLogger):
|
|||
"total_tokens": response_obj["usage"].get("total_tokens"),
|
||||
}
|
||||
|
||||
subject = kwargs.get("user", None), # end-user passed in via 'user' param
|
||||
subject = (kwargs.get("user", None),) # end-user passed in via 'user' param
|
||||
if not subject:
|
||||
raise Exception("OpenMeter: user is required")
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
# On success, log events to Prometheus
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
@ -19,7 +19,6 @@ class PrometheusLogger:
|
|||
**kwargs,
|
||||
):
|
||||
try:
|
||||
print(f"in init prometheus metrics")
|
||||
from prometheus_client import Counter
|
||||
|
||||
self.litellm_llm_api_failed_requests_metric = Counter(
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
@ -183,7 +183,6 @@ class PrometheusServicesLogger:
|
|||
)
|
||||
|
||||
async def async_service_failure_hook(self, payload: ServiceLoggerPayload):
|
||||
print(f"received error payload: {payload.error}")
|
||||
if self.mock_testing:
|
||||
self.mock_testing_failure_calls += 1
|
||||
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
#### What this does ####
|
||||
# On success, logs events to Promptlayer
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
from pydantic import BaseModel
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
||||
|
||||
class PromptLayerLogger:
|
||||
# Class variables or attributes
|
||||
def __init__(self):
|
||||
|
@ -32,7 +33,11 @@ class PromptLayerLogger:
|
|||
tags = kwargs["litellm_params"]["metadata"]["pl_tags"]
|
||||
|
||||
# Remove "pl_tags" from metadata
|
||||
metadata = {k:v for k, v in kwargs["litellm_params"]["metadata"].items() if k != "pl_tags"}
|
||||
metadata = {
|
||||
k: v
|
||||
for k, v in kwargs["litellm_params"]["metadata"].items()
|
||||
if k != "pl_tags"
|
||||
}
|
||||
|
||||
print_verbose(
|
||||
f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}"
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -1,25 +1,82 @@
|
|||
#### What this does ####
|
||||
# Class for sending Slack Alerts #
|
||||
import dotenv, os
|
||||
from litellm.proxy._types import UserAPIKeyAuth
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import copy
|
||||
import traceback
|
||||
from litellm._logging import verbose_logger, verbose_proxy_logger
|
||||
import litellm
|
||||
import litellm, threading
|
||||
from typing import List, Literal, Any, Union, Optional, Dict
|
||||
from litellm.caching import DualCache
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
import datetime
|
||||
from pydantic import BaseModel
|
||||
from enum import Enum
|
||||
from datetime import datetime as dt, timedelta
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
import random
|
||||
|
||||
|
||||
class SlackAlerting:
|
||||
class LiteLLMBase(BaseModel):
|
||||
"""
|
||||
Implements default functions, all pydantic objects should have.
|
||||
"""
|
||||
|
||||
def json(self, **kwargs):
|
||||
try:
|
||||
return self.model_dump() # noqa
|
||||
except:
|
||||
# if using pydantic v1
|
||||
return self.dict()
|
||||
|
||||
|
||||
class SlackAlertingArgs(LiteLLMBase):
|
||||
daily_report_frequency: int = 12 * 60 * 60 # 12 hours
|
||||
report_check_interval: int = 5 * 60 # 5 minutes
|
||||
|
||||
|
||||
class DeploymentMetrics(LiteLLMBase):
|
||||
"""
|
||||
Metrics per deployment, stored in cache
|
||||
|
||||
Used for daily reporting
|
||||
"""
|
||||
|
||||
id: str
|
||||
"""id of deployment in router model list"""
|
||||
|
||||
failed_request: bool
|
||||
"""did it fail the request?"""
|
||||
|
||||
latency_per_output_token: Optional[float]
|
||||
"""latency/output token of deployment"""
|
||||
|
||||
updated_at: dt
|
||||
"""Current time of deployment being updated"""
|
||||
|
||||
|
||||
class SlackAlertingCacheKeys(Enum):
|
||||
"""
|
||||
Enum for deployment daily metrics keys - {deployment_id}:{enum}
|
||||
"""
|
||||
|
||||
failed_requests_key = "failed_requests_daily_metrics"
|
||||
latency_key = "latency_daily_metrics"
|
||||
report_sent_key = "daily_metrics_report_sent"
|
||||
|
||||
|
||||
class SlackAlerting(CustomLogger):
|
||||
"""
|
||||
Class for sending Slack Alerts
|
||||
"""
|
||||
|
||||
# Class variables or attributes
|
||||
def __init__(
|
||||
self,
|
||||
alerting_threshold: float = 300,
|
||||
internal_usage_cache: Optional[DualCache] = None,
|
||||
alerting_threshold: float = 300, # threshold for slow / hanging llm responses (in seconds)
|
||||
alerting: Optional[List] = [],
|
||||
alert_types: Optional[
|
||||
List[
|
||||
|
@ -29,6 +86,7 @@ class SlackAlerting:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
]
|
||||
] = [
|
||||
|
@ -37,18 +95,23 @@ class SlackAlerting:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
],
|
||||
alert_to_webhook_url: Optional[
|
||||
Dict
|
||||
] = None, # if user wants to separate alerts to diff channels
|
||||
alerting_args={},
|
||||
default_webhook_url: Optional[str] = None,
|
||||
):
|
||||
self.alerting_threshold = alerting_threshold
|
||||
self.alerting = alerting
|
||||
self.alert_types = alert_types
|
||||
self.internal_usage_cache = DualCache()
|
||||
self.internal_usage_cache = internal_usage_cache or DualCache()
|
||||
self.async_http_handler = AsyncHTTPHandler()
|
||||
self.alert_to_webhook_url = alert_to_webhook_url
|
||||
pass
|
||||
self.is_running = False
|
||||
self.alerting_args = SlackAlertingArgs(**alerting_args)
|
||||
self.default_webhook_url = default_webhook_url
|
||||
|
||||
def update_values(
|
||||
self,
|
||||
|
@ -56,6 +119,7 @@ class SlackAlerting:
|
|||
alerting_threshold: Optional[float] = None,
|
||||
alert_types: Optional[List] = None,
|
||||
alert_to_webhook_url: Optional[Dict] = None,
|
||||
alerting_args: Optional[Dict] = None,
|
||||
):
|
||||
if alerting is not None:
|
||||
self.alerting = alerting
|
||||
|
@ -63,7 +127,8 @@ class SlackAlerting:
|
|||
self.alerting_threshold = alerting_threshold
|
||||
if alert_types is not None:
|
||||
self.alert_types = alert_types
|
||||
|
||||
if alerting_args is not None:
|
||||
self.alerting_args = SlackAlertingArgs(**alerting_args)
|
||||
if alert_to_webhook_url is not None:
|
||||
# update the dict
|
||||
if self.alert_to_webhook_url is None:
|
||||
|
@ -90,18 +155,23 @@ class SlackAlerting:
|
|||
|
||||
def _add_langfuse_trace_id_to_alert(
|
||||
self,
|
||||
request_info: str,
|
||||
request_data: Optional[dict] = None,
|
||||
kwargs: Optional[dict] = None,
|
||||
type: Literal["hanging_request", "slow_response"] = "hanging_request",
|
||||
start_time: Optional[datetime.datetime] = None,
|
||||
end_time: Optional[datetime.datetime] = None,
|
||||
):
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Returns langfuse trace url
|
||||
"""
|
||||
# do nothing for now
|
||||
pass
|
||||
return request_info
|
||||
if (
|
||||
request_data is not None
|
||||
and request_data.get("metadata", {}).get("trace_id", None) is not None
|
||||
):
|
||||
trace_id = request_data["metadata"]["trace_id"]
|
||||
if litellm.utils.langFuseLogger is not None:
|
||||
base_url = litellm.utils.langFuseLogger.Langfuse.base_url
|
||||
return f"{base_url}/trace/{trace_id}"
|
||||
return None
|
||||
|
||||
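`_add_langfuse_trace_id_to_alert` now returns a clickable trace URL instead of passing `request_info` through: it reads `trace_id` from the request metadata and joins it with the Langfuse client's `base_url`. A pure-function sketch of that URL construction (inputs are illustrative; the real method pulls `base_url` off `litellm.utils.langFuseLogger`):

```python
# Sketch of the trace-URL construction above (hypothetical inputs).
from typing import Optional


def langfuse_trace_url(base_url: Optional[str], request_metadata: dict) -> Optional[str]:
    trace_id = request_metadata.get("trace_id")
    if base_url is None or trace_id is None:
        return None
    return f"{base_url}/trace/{trace_id}"


assert (
    langfuse_trace_url("https://cloud.langfuse.com", {"trace_id": "abc123"})
    == "https://cloud.langfuse.com/trace/abc123"
)
```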
def _response_taking_too_long_callback(
|
||||
def _response_taking_too_long_callback_helper(
|
||||
self,
|
||||
kwargs, # kwargs to completion
|
||||
start_time,
|
||||
|
@ -166,7 +236,7 @@ class SlackAlerting:
|
|||
return
|
||||
|
||||
time_difference_float, model, api_base, messages = (
|
||||
self._response_taking_too_long_callback(
|
||||
self._response_taking_too_long_callback_helper(
|
||||
kwargs=kwargs,
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
|
@ -182,6 +252,9 @@ class SlackAlerting:
|
|||
and "metadata" in kwargs["litellm_params"]
|
||||
):
|
||||
_metadata = kwargs["litellm_params"]["metadata"]
|
||||
request_info = litellm.utils._add_key_name_and_team_to_alert(
|
||||
request_info=request_info, metadata=_metadata
|
||||
)
|
||||
|
||||
_deployment_latency_map = self._get_deployment_latencies_to_alert(
|
||||
metadata=_metadata
|
||||
|
@ -196,8 +269,178 @@ class SlackAlerting:
|
|||
alert_type="llm_too_slow",
|
||||
)
|
||||
|
||||
async def log_failure_event(self, original_exception: Exception):
|
||||
pass
|
||||
async def async_update_daily_reports(
|
||||
self, deployment_metrics: DeploymentMetrics
|
||||
) -> int:
|
||||
"""
|
||||
Store the perf by deployment in cache
|
||||
- Number of failed requests per deployment
|
||||
- Latency / output tokens per deployment
|
||||
|
||||
'deployment_id:daily_metrics:failed_requests'
|
||||
'deployment_id:daily_metrics:latency_per_output_token'
|
||||
|
||||
Returns
|
||||
int - count of metrics set (1 - if just latency, 2 - if failed + latency)
|
||||
"""
|
||||
|
||||
return_val = 0
|
||||
try:
|
||||
## FAILED REQUESTS ##
|
||||
if deployment_metrics.failed_request:
|
||||
await self.internal_usage_cache.async_increment_cache(
|
||||
key="{}:{}".format(
|
||||
deployment_metrics.id,
|
||||
SlackAlertingCacheKeys.failed_requests_key.value,
|
||||
),
|
||||
value=1,
|
||||
)
|
||||
|
||||
return_val += 1
|
||||
|
||||
## LATENCY ##
|
||||
if deployment_metrics.latency_per_output_token is not None:
|
||||
await self.internal_usage_cache.async_increment_cache(
|
||||
key="{}:{}".format(
|
||||
deployment_metrics.id, SlackAlertingCacheKeys.latency_key.value
|
||||
),
|
||||
value=deployment_metrics.latency_per_output_token,
|
||||
)
|
||||
|
||||
return_val += 1
|
||||
|
||||
return return_val
|
||||
except Exception as e:
|
||||
return 0
|
||||
|
||||
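`async_update_daily_reports` keeps two rolling counters per deployment in the internal cache, keyed `<deployment_id>:failed_requests_daily_metrics` and `<deployment_id>:latency_daily_metrics`, and returns how many it touched. The same bookkeeping with a plain dict standing in for `DualCache` (helper name and dict cache are assumptions for illustration):

```python
# Sketch of the per-deployment counters above; a dict stands in for DualCache.
from typing import Optional


def update_daily_metrics(
    cache: dict,
    deployment_id: str,
    failed: bool,
    latency_per_output_token: Optional[float],
) -> int:
    updated = 0
    if failed:
        key = f"{deployment_id}:failed_requests_daily_metrics"
        cache[key] = cache.get(key, 0) + 1
        updated += 1
    if latency_per_output_token is not None:
        key = f"{deployment_id}:latency_daily_metrics"
        cache[key] = cache.get(key, 0) + latency_per_output_token
        updated += 1
    return updated


cache: dict = {}
assert update_daily_metrics(cache, "deploy-1", failed=True, latency_per_output_token=0.02) == 2
assert cache["deploy-1:failed_requests_daily_metrics"] == 1
```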
async def send_daily_reports(self, router) -> bool:
|
||||
"""
|
||||
Send a daily report on:
|
||||
- Top 5 deployments with most failed requests
|
||||
- Top 5 slowest deployments (normalized by latency/output tokens)
|
||||
|
||||
Get the value from redis cache (if available) or in-memory and send it
|
||||
|
||||
Cleanup:
|
||||
- reset values in cache -> prevent memory leak
|
||||
|
||||
Returns:
|
||||
True -> if successfully sent
|
||||
False -> if not sent
|
||||
"""
|
||||
|
||||
ids = router.get_model_ids()
|
||||
|
||||
# get keys
|
||||
failed_request_keys = [
|
||||
"{}:{}".format(id, SlackAlertingCacheKeys.failed_requests_key.value)
|
||||
for id in ids
|
||||
]
|
||||
latency_keys = [
|
||||
"{}:{}".format(id, SlackAlertingCacheKeys.latency_key.value) for id in ids
|
||||
]
|
||||
|
||||
combined_metrics_keys = failed_request_keys + latency_keys # reduce cache calls
|
||||
|
||||
combined_metrics_values = await self.internal_usage_cache.async_batch_get_cache(
|
||||
keys=combined_metrics_keys
|
||||
) # [1, 2, None, ..]
|
||||
|
||||
all_none = True
|
||||
for val in combined_metrics_values:
|
||||
if val is not None:
|
||||
all_none = False
|
||||
|
||||
if all_none:
|
||||
return False
|
||||
|
||||
failed_request_values = combined_metrics_values[
|
||||
: len(failed_request_keys)
|
||||
] # # [1, 2, None, ..]
|
||||
latency_values = combined_metrics_values[len(failed_request_keys) :]
|
||||
|
||||
# find top 5 failed
|
||||
## Replace None values with a placeholder value (0 in this case)
|
||||
placeholder_value = 0
|
||||
replaced_failed_values = [
|
||||
value if value is not None else placeholder_value
|
||||
for value in failed_request_values
|
||||
]
|
||||
|
||||
## Get the indices of top 5 keys with the highest numerical values (ignoring None values)
|
||||
top_5_failed = sorted(
|
||||
range(len(replaced_failed_values)),
|
||||
key=lambda i: replaced_failed_values[i],
|
||||
reverse=True,
|
||||
)[:5]
|
||||
|
||||
# find top 5 slowest
|
||||
# Replace None values with a placeholder value (0 in this case)
|
||||
placeholder_value = 0
|
||||
replaced_slowest_values = [
|
||||
value if value is not None else placeholder_value
|
||||
for value in latency_values
|
||||
]
|
||||
|
||||
# Get the indices of top 5 values with the highest numerical values (ignoring None values)
|
||||
top_5_slowest = sorted(
|
||||
range(len(replaced_slowest_values)),
|
||||
key=lambda i: replaced_slowest_values[i],
|
||||
reverse=True,
|
||||
)[:5]
|
||||
|
||||
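Both "top 5" selections above use the same pattern: replace `None` metrics with a placeholder of 0, sort the indices by value in descending order, and keep the first five. Isolated as a helper (name is illustrative):

```python
# The top-k index selection used above, as a standalone helper.
def top_k_indices(values: list, k: int = 5) -> list:
    filled = [v if v is not None else 0 for v in values]  # placeholder 0 for missing metrics
    return sorted(range(len(filled)), key=lambda i: filled[i], reverse=True)[:k]


assert top_k_indices([3, None, 7, 1], k=2) == [2, 0]
```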
# format alert -> return the litellm model name + api base
|
||||
message = f"\n\nHere are today's key metrics 📈: \n\n"
|
||||
|
||||
message += "\n\n*❗️ Top 5 Deployments with Most Failed Requests:*\n\n"
|
||||
for i in range(len(top_5_failed)):
|
||||
key = failed_request_keys[top_5_failed[i]].split(":")[0]
|
||||
_deployment = router.get_model_info(key)
|
||||
if isinstance(_deployment, dict):
|
||||
deployment_name = _deployment["litellm_params"].get("model", "")
|
||||
else:
|
||||
return False
|
||||
|
||||
api_base = litellm.get_api_base(
|
||||
model=deployment_name,
|
||||
optional_params=(
|
||||
_deployment["litellm_params"] if _deployment is not None else {}
|
||||
),
|
||||
)
|
||||
if api_base is None:
|
||||
api_base = ""
|
||||
value = replaced_failed_values[top_5_failed[i]]
|
||||
message += f"\t{i+1}. Deployment: `{deployment_name}`, Failed Requests: `{value}`, API Base: `{api_base}`\n"
|
||||
|
||||
message += "\n\n*😅 Top 5 Slowest Deployments:*\n\n"
|
||||
for i in range(len(top_5_slowest)):
|
||||
key = latency_keys[top_5_slowest[i]].split(":")[0]
|
||||
_deployment = router.get_model_info(key)
|
||||
if _deployment is not None:
|
||||
deployment_name = _deployment["litellm_params"].get("model", "")
|
||||
else:
|
||||
deployment_name = ""
|
||||
api_base = litellm.get_api_base(
|
||||
model=deployment_name,
|
||||
optional_params=(
|
||||
_deployment["litellm_params"] if _deployment is not None else {}
|
||||
),
|
||||
)
|
||||
value = round(replaced_slowest_values[top_5_slowest[i]], 3)
|
||||
message += f"\t{i+1}. Deployment: `{deployment_name}`, Latency per output token: `{value}s/token`, API Base: `{api_base}`\n\n"
|
||||
|
||||
# cache cleanup -> reset values to 0
|
||||
latency_cache_keys = [(key, 0) for key in latency_keys]
|
||||
failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
|
||||
combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
|
||||
await self.internal_usage_cache.async_batch_set_cache(
|
||||
cache_list=combined_metrics_cache_keys
|
||||
)
|
||||
|
||||
# send alert
|
||||
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
|
||||
|
||||
return True
|
||||
|
||||
async def response_taking_too_long(
|
||||
self,
|
||||
|
@ -255,6 +498,11 @@ class SlackAlerting:
|
|||
# in that case we fallback to the api base set in the request metadata
|
||||
_metadata = request_data["metadata"]
|
||||
_api_base = _metadata.get("api_base", "")
|
||||
|
||||
request_info = litellm.utils._add_key_name_and_team_to_alert(
|
||||
request_info=request_info, metadata=_metadata
|
||||
)
|
||||
|
||||
if _api_base is None:
|
||||
_api_base = ""
|
||||
request_info += f"\nAPI Base: `{_api_base}`"
|
||||
|
@ -264,14 +512,13 @@ class SlackAlerting:
|
|||
)
|
||||
|
||||
if "langfuse" in litellm.success_callback:
|
||||
request_info = self._add_langfuse_trace_id_to_alert(
|
||||
request_info=request_info,
|
||||
langfuse_url = self._add_langfuse_trace_id_to_alert(
|
||||
request_data=request_data,
|
||||
type="hanging_request",
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
)
|
||||
|
||||
if langfuse_url is not None:
|
||||
request_info += "\n🪢 Langfuse Trace: {}".format(langfuse_url)
|
||||
|
||||
# add deployment latencies to alert
|
||||
_deployment_latency_map = self._get_deployment_latencies_to_alert(
|
||||
metadata=request_data.get("metadata", {})
|
||||
|
@ -404,6 +651,53 @@ class SlackAlerting:
|
|||
|
||||
return
|
||||
|
||||
async def model_added_alert(self, model_name: str, litellm_model_name: str):
|
||||
model_info = litellm.model_cost.get(litellm_model_name, {})
|
||||
model_info_str = ""
|
||||
for k, v in model_info.items():
|
||||
if k == "input_cost_per_token" or k == "output_cost_per_token":
|
||||
# when converting to string it should not be 1.63e-06
|
||||
v = "{:.8f}".format(v)
|
||||
|
||||
model_info_str += f"{k}: {v}\n"
|
||||
|
||||
message = f"""
|
||||
*🚅 New Model Added*
|
||||
Model Name: `{model_name}`
|
||||
|
||||
Usage OpenAI Python SDK:
|
||||
```
|
||||
import openai
|
||||
client = openai.OpenAI(
|
||||
api_key="your_api_key",
|
||||
base_url={os.getenv("PROXY_BASE_URL", "http://0.0.0.0:4000")}
|
||||
)
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="{model_name}", # model to send to the proxy
|
||||
messages = [
|
||||
{{
|
||||
"role": "user",
|
||||
"content": "this is a test request, write a short poem"
|
||||
}}
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Model Info:
|
||||
```
|
||||
{model_info_str}
|
||||
```
|
||||
"""
|
||||
|
||||
await self.send_alert(
|
||||
message=message, level="Low", alert_type="new_model_added"
|
||||
)
|
||||
pass
|
||||
|
||||
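The `"{:.8f}".format(v)` step in `model_added_alert` exists so per-token prices render as fixed-point numbers in the Slack message rather than scientific notation:

```python
# Why the cost fields are formatted with "{:.8f}" before being put in the alert text.
v = 1.63e-06
assert str(v) == "1.63e-06"                # scientific notation, hard to read in Slack
assert "{:.8f}".format(v) == "0.00000163"  # fixed-point, as shown in the alert
```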
async def model_removed_alert(self, model_name: str):
|
||||
pass
|
||||
|
||||
async def send_alert(
|
||||
self,
|
||||
message: str,
|
||||
|
@ -414,7 +708,11 @@ class SlackAlerting:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
"new_model_added",
|
||||
"cooldown_deployment",
|
||||
],
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
|
||||
|
@ -439,9 +737,16 @@ class SlackAlerting:
|
|||
# Get the current timestamp
|
||||
current_time = datetime.now().strftime("%H:%M:%S")
|
||||
_proxy_base_url = os.getenv("PROXY_BASE_URL", None)
|
||||
formatted_message = (
|
||||
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
|
||||
)
|
||||
if alert_type == "daily_reports" or alert_type == "new_model_added":
|
||||
formatted_message = message
|
||||
else:
|
||||
formatted_message = (
|
||||
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
|
||||
)
|
||||
|
||||
if kwargs:
|
||||
for key, value in kwargs.items():
|
||||
formatted_message += f"\n\n{key}: `{value}`\n\n"
|
||||
if _proxy_base_url is not None:
|
||||
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
||||
|
||||
|
@ -451,6 +756,8 @@ class SlackAlerting:
|
|||
and alert_type in self.alert_to_webhook_url
|
||||
):
|
||||
slack_webhook_url = self.alert_to_webhook_url[alert_type]
|
||||
elif self.default_webhook_url is not None:
|
||||
slack_webhook_url = self.default_webhook_url
|
||||
else:
|
||||
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
|
||||
|
||||
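`send_alert` resolves its webhook in three steps: a per-alert-type entry in `alert_to_webhook_url`, then `default_webhook_url`, then the `SLACK_WEBHOOK_URL` environment variable. A sketch of that precedence as a free function (illustrative signature, not the class method):

```python
# Webhook resolution order implied by the branch above.
import os
from typing import Optional


def resolve_webhook_url(
    alert_type: str,
    alert_to_webhook_url: Optional[dict],
    default_webhook_url: Optional[str],
) -> Optional[str]:
    if alert_to_webhook_url is not None and alert_type in alert_to_webhook_url:
        return alert_to_webhook_url[alert_type]
    if default_webhook_url is not None:
        return default_webhook_url
    return os.getenv("SLACK_WEBHOOK_URL", None)


assert (
    resolve_webhook_url("db_exceptions", {"db_exceptions": "https://hooks.example/a"}, None)
    == "https://hooks.example/a"
)
```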
|
@ -468,3 +775,113 @@ class SlackAlerting:
|
|||
pass
|
||||
else:
|
||||
print("Error sending slack alert. Error=", response.text) # noqa
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
"""Log deployment latency"""
|
||||
if "daily_reports" in self.alert_types:
|
||||
model_id = (
|
||||
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
|
||||
)
|
||||
response_s: timedelta = end_time - start_time
|
||||
|
||||
final_value = response_s
|
||||
total_tokens = 0
|
||||
|
||||
if isinstance(response_obj, litellm.ModelResponse):
|
||||
completion_tokens = response_obj.usage.completion_tokens
|
||||
final_value = float(response_s.total_seconds() / completion_tokens)
|
||||
|
||||
await self.async_update_daily_reports(
|
||||
DeploymentMetrics(
|
||||
id=model_id,
|
||||
failed_request=False,
|
||||
latency_per_output_token=final_value,
|
||||
updated_at=litellm.utils.get_utc_datetime(),
|
||||
)
|
||||
)
|
||||
|
||||
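`async_log_success_event` normalises each deployment's latency to seconds per completion token before feeding it into the daily report. The arithmetic in isolation:

```python
# Latency normalisation used above: wall-clock seconds divided by completion tokens.
from datetime import datetime, timedelta


def latency_per_output_token(start: datetime, end: datetime, completion_tokens: int) -> float:
    elapsed: timedelta = end - start
    return float(elapsed.total_seconds() / completion_tokens)


start = datetime(2024, 1, 1, 12, 0, 0)
assert latency_per_output_token(start, start + timedelta(seconds=2), completion_tokens=100) == 0.02
```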
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
|
||||
"""Log failure + deployment latency"""
|
||||
if "daily_reports" in self.alert_types:
|
||||
model_id = (
|
||||
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
|
||||
)
|
||||
await self.async_update_daily_reports(
|
||||
DeploymentMetrics(
|
||||
id=model_id,
|
||||
failed_request=True,
|
||||
latency_per_output_token=None,
|
||||
updated_at=litellm.utils.get_utc_datetime(),
|
||||
)
|
||||
)
|
||||
if "llm_exceptions" in self.alert_types:
|
||||
original_exception = kwargs.get("exception", None)
|
||||
|
||||
await self.send_alert(
|
||||
message="LLM API Failure - " + str(original_exception),
|
||||
level="High",
|
||||
alert_type="llm_exceptions",
|
||||
)
|
||||
|
||||
async def _run_scheduler_helper(self, llm_router) -> bool:
|
||||
"""
|
||||
Returns:
|
||||
- True -> report sent
|
||||
- False -> report not sent
|
||||
"""
|
||||
report_sent_bool = False
|
||||
|
||||
report_sent = await self.internal_usage_cache.async_get_cache(
|
||||
key=SlackAlertingCacheKeys.report_sent_key.value
|
||||
) # None | datetime
|
||||
|
||||
current_time = litellm.utils.get_utc_datetime()
|
||||
|
||||
if report_sent is None:
|
||||
_current_time = current_time.isoformat()
|
||||
await self.internal_usage_cache.async_set_cache(
|
||||
key=SlackAlertingCacheKeys.report_sent_key.value,
|
||||
value=_current_time,
|
||||
)
|
||||
else:
|
||||
# check if current time - interval >= time last sent
|
||||
delta = current_time - timedelta(
|
||||
seconds=self.alerting_args.daily_report_frequency
|
||||
)
|
||||
|
||||
if isinstance(report_sent, str):
|
||||
report_sent = dt.fromisoformat(report_sent)
|
||||
|
||||
if delta >= report_sent:
|
||||
# Sneak in the reporting logic here
|
||||
await self.send_daily_reports(router=llm_router)
|
||||
# Also, don't forget to update the report_sent time after sending the report!
|
||||
_current_time = current_time.isoformat()
|
||||
await self.internal_usage_cache.async_set_cache(
|
||||
key=SlackAlertingCacheKeys.report_sent_key.value,
|
||||
value=_current_time,
|
||||
)
|
||||
report_sent_bool = True
|
||||
|
||||
return report_sent_bool
|
||||
|
||||
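`_run_scheduler_helper` decides whether the daily report is due by checking `now - daily_report_frequency >= last_sent` against the timestamp cached under `daily_metrics_report_sent`. The check as a pure function (assumes `last_sent` has already been parsed back into a datetime):

```python
# The "is the daily report due?" check above, isolated.
from datetime import datetime, timedelta


def report_is_due(now: datetime, last_sent: datetime, frequency_s: int) -> bool:
    return (now - timedelta(seconds=frequency_s)) >= last_sent


last = datetime(2024, 1, 1, 0, 0, 0)
assert report_is_due(last + timedelta(hours=13), last, frequency_s=12 * 60 * 60)
assert not report_is_due(last + timedelta(hours=1), last, frequency_s=12 * 60 * 60)
```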
async def _run_scheduled_daily_report(self, llm_router: Optional[Any] = None):
|
||||
"""
|
||||
If 'daily_reports' enabled
|
||||
|
||||
Ping redis cache every 5 minutes to check if we should send the report
|
||||
|
||||
If yes -> call send_daily_report()
|
||||
"""
|
||||
if llm_router is None or self.alert_types is None:
|
||||
return
|
||||
|
||||
if "daily_reports" in self.alert_types:
|
||||
while True:
|
||||
await self._run_scheduler_helper(llm_router=llm_router)
|
||||
interval = random.randint(
|
||||
self.alerting_args.report_check_interval - 3,
|
||||
self.alerting_args.report_check_interval + 3,
|
||||
) # shuffle to prevent collisions
|
||||
await asyncio.sleep(interval)
|
||||
return
|
||||
|
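The scheduler loop above sleeps for `report_check_interval` plus or minus 3 seconds so that multiple proxy workers do not all wake and try to send the report at the same instant. The jitter in isolation:

```python
# Jittered poll interval used by the daily-report scheduler above.
import random


def jittered_interval(base_s: int, jitter_s: int = 3) -> int:
    return random.randint(base_s - jitter_s, base_s + jitter_s)


interval = jittered_interval(5 * 60)
assert 297 <= interval <= 303
```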
|
|
@ -2,7 +2,7 @@
|
|||
# On success + failure, log events to Supabase
|
||||
|
||||
import dotenv, os
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import os, types, traceback
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time, httpx
|
||||
import requests # type: ignore
|
||||
import time, httpx # type: ignore
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Choices, Message
|
||||
import litellm
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class AlephAlphaError(Exception):
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests, copy
|
||||
import requests, copy # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
|
||||
|
@ -9,7 +9,7 @@ import litellm
|
|||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
|
||||
from .base import BaseLLM
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class AnthropicConstants(Enum):
|
||||
|
@ -184,11 +184,6 @@ class AnthropicChatCompletion(BaseLLM):
|
|||
message=str(completion_response["error"]),
|
||||
status_code=response.status_code,
|
||||
)
|
||||
elif len(completion_response["content"]) == 0:
|
||||
raise AnthropicError(
|
||||
message="No content in response",
|
||||
status_code=500,
|
||||
)
|
||||
else:
|
||||
text_content = ""
|
||||
tool_calls = []
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional, Union, Any
|
||||
from typing import Optional, Union, Any, Literal
|
||||
import types, requests
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
|
@ -12,7 +12,7 @@ from litellm.utils import (
|
|||
from typing import Callable, Optional, BinaryIO
|
||||
from litellm import OpenAIConfig
|
||||
import litellm, json
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
|
||||
from openai import AzureOpenAI, AsyncAzureOpenAI
|
||||
import uuid
|
||||
|
@ -952,6 +952,81 @@ class AzureChatCompletion(BaseLLM):
|
|||
)
|
||||
raise e
|
||||
|
||||
def get_headers(
|
||||
self,
|
||||
model: Optional[str],
|
||||
api_key: str,
|
||||
api_base: str,
|
||||
api_version: str,
|
||||
timeout: float,
|
||||
mode: str,
|
||||
messages: Optional[list] = None,
|
||||
input: Optional[list] = None,
|
||||
prompt: Optional[str] = None,
|
||||
) -> dict:
|
||||
client_session = litellm.client_session or httpx.Client(
|
||||
transport=CustomHTTPTransport(), # handle dall-e-2 calls
|
||||
)
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
## build base url - assume api base includes resource name
|
||||
if not api_base.endswith("/"):
|
||||
api_base += "/"
|
||||
api_base += f"{model}"
|
||||
client = AzureOpenAI(
|
||||
base_url=api_base,
|
||||
api_version=api_version,
|
||||
api_key=api_key,
|
||||
timeout=timeout,
|
||||
http_client=client_session,
|
||||
)
|
||||
model = None
|
||||
# cloudflare ai gateway, needs model=None
|
||||
else:
|
||||
client = AzureOpenAI(
|
||||
api_version=api_version,
|
||||
azure_endpoint=api_base,
|
||||
api_key=api_key,
|
||||
timeout=timeout,
|
||||
http_client=client_session,
|
||||
)
|
||||
|
||||
# only run this check if it's not cloudflare ai gateway
|
||||
if model is None and mode != "image_generation":
|
||||
raise Exception("model is not set")
|
||||
|
||||
completion = None
|
||||
|
||||
if messages is None:
|
||||
messages = [{"role": "user", "content": "Hey"}]
|
||||
try:
|
||||
completion = client.chat.completions.with_raw_response.create(
|
||||
model=model, # type: ignore
|
||||
messages=messages, # type: ignore
|
||||
)
|
||||
except Exception as e:
|
||||
raise e
|
||||
response = {}
|
||||
|
||||
if completion is None or not hasattr(completion, "headers"):
|
||||
raise Exception("invalid completion response")
|
||||
|
||||
if (
|
||||
completion.headers.get("x-ratelimit-remaining-requests", None) is not None
|
||||
): # not provided for dall-e requests
|
||||
response["x-ratelimit-remaining-requests"] = completion.headers[
|
||||
"x-ratelimit-remaining-requests"
|
||||
]
|
||||
|
||||
if completion.headers.get("x-ratelimit-remaining-tokens", None) is not None:
|
||||
response["x-ratelimit-remaining-tokens"] = completion.headers[
|
||||
"x-ratelimit-remaining-tokens"
|
||||
]
|
||||
|
||||
if completion.headers.get("x-ms-region", None) is not None:
|
||||
response["x-ms-region"] = completion.headers["x-ms-region"]
|
||||
|
||||
return response
|
||||
|
||||
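The new `get_headers` makes one raw-response chat call and copies a handful of rate-limit/region headers into a plain dict. The extraction step on its own (a plain dict stands in for the raw response's header mapping):

```python
# Header extraction mirroring get_headers above.
def extract_proxy_headers(headers: dict) -> dict:
    wanted = ["x-ratelimit-remaining-requests", "x-ratelimit-remaining-tokens", "x-ms-region"]
    return {name: headers[name] for name in wanted if headers.get(name) is not None}


assert extract_proxy_headers(
    {"x-ratelimit-remaining-tokens": "39000", "x-ms-region": "East US", "date": "..."}
) == {"x-ratelimit-remaining-tokens": "39000", "x-ms-region": "East US"}
```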
async def ahealth_check(
|
||||
self,
|
||||
model: Optional[str],
|
||||
|
@ -963,7 +1038,7 @@ class AzureChatCompletion(BaseLLM):
|
|||
messages: Optional[list] = None,
|
||||
input: Optional[list] = None,
|
||||
prompt: Optional[str] = None,
|
||||
):
|
||||
) -> dict:
|
||||
client_session = litellm.aclient_session or httpx.AsyncClient(
|
||||
transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls
|
||||
)
|
||||
|
@ -1040,4 +1115,8 @@ class AzureChatCompletion(BaseLLM):
|
|||
response["x-ratelimit-remaining-tokens"] = completion.headers[
|
||||
"x-ratelimit-remaining-tokens"
|
||||
]
|
||||
|
||||
if completion.headers.get("x-ms-region", None) is not None:
|
||||
response["x-ms-region"] = completion.headers["x-ms-region"]
|
||||
|
||||
return response
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, Union, Any
|
||||
import types, requests
|
||||
import types, requests # type: ignore
|
||||
from .base import BaseLLM
|
||||
from litellm.utils import (
|
||||
ModelResponse,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
|
|
|
@ -163,10 +163,9 @@ class AmazonAnthropicClaude3Config:
|
|||
"stop",
|
||||
"temperature",
|
||||
"top_p",
|
||||
"extra_headers"
|
||||
"extra_headers",
|
||||
]
|
||||
|
||||
|
||||
def map_openai_params(self, non_default_params: dict, optional_params: dict):
|
||||
for param, value in non_default_params.items():
|
||||
if param == "max_tokens":
|
||||
|
@ -534,10 +533,12 @@ class AmazonStabilityConfig:
|
|||
|
||||
def add_custom_header(headers):
|
||||
"""Closure to capture the headers and add them."""
|
||||
|
||||
def callback(request, **kwargs):
|
||||
"""Actual callback function that Boto3 will call."""
|
||||
for header_name, header_value in headers.items():
|
||||
request.headers.add_header(header_name, header_value)
|
||||
|
||||
return callback
|
||||
|
||||
|
||||
|
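`add_custom_header` is a closure: it captures the extra headers once and returns the callback that boto3 invokes on the `before-sign.bedrock-runtime.*` event, which then writes each header onto the outgoing request. The same pattern with a stand-in request object (the real callback uses `request.headers.add_header`):

```python
# Closure pattern used above; FakeRequest stands in for the botocore request object.
def add_custom_header(headers: dict):
    def callback(request, **kwargs):
        for name, value in headers.items():
            request.headers[name] = value  # simplified; botocore uses headers.add_header(...)
    return callback


class FakeRequest:
    def __init__(self):
        self.headers: dict = {}


req = FakeRequest()
add_custom_header({"x-tenant-id": "acme"})(req)
assert req.headers == {"x-tenant-id": "acme"}
```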
@ -672,7 +673,9 @@ def init_bedrock_client(
|
|||
config=config,
|
||||
)
|
||||
if extra_headers:
|
||||
client.meta.events.register('before-sign.bedrock-runtime.*', add_custom_header(extra_headers))
|
||||
client.meta.events.register(
|
||||
"before-sign.bedrock-runtime.*", add_custom_header(extra_headers)
|
||||
)
|
||||
|
||||
return client
|
||||
|
||||
|
@ -1224,7 +1227,7 @@ def _embedding_func_single(
|
|||
"input_type", "search_document"
|
||||
) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3
|
||||
data = {"texts": [input], **inference_params} # type: ignore
|
||||
body = json.dumps(data).encode("utf-8")
|
||||
body = json.dumps(data).encode("utf-8") # type: ignore
|
||||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
|
@ -1416,7 +1419,7 @@ def image_generation(
|
|||
## LOGGING
|
||||
request_str = f"""
|
||||
response = client.invoke_model(
|
||||
body={body},
|
||||
body={body}, # type: ignore
|
||||
modelId={modelId},
|
||||
accept="application/json",
|
||||
contentType="application/json",
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time, traceback
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class CohereError(Exception):
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time, traceback
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .prompt_templates.factory import cohere_message_pt
|
||||
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time, traceback
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Choices, Message, Usage
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from itertools import chain
|
||||
import requests, types, time
|
||||
import requests, types, time # type: ignore
|
||||
import json, uuid
|
||||
import traceback
|
||||
from typing import Optional
|
||||
import litellm
|
||||
import httpx, aiohttp, asyncio
|
||||
import httpx, aiohttp, asyncio # type: ignore
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
||||
|
@ -220,7 +220,10 @@ def get_ollama_response(
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -232,7 +235,9 @@ def get_ollama_response(
|
|||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + model
|
||||
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore
|
||||
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count", len(response_json.get("message", dict()).get("content", ""))
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -273,7 +278,10 @@ def ollama_completion_stream(url, data, logging_obj):
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -314,9 +322,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
|
|||
first_chunk_content = first_chunk.choices[0].delta.content or ""
|
||||
response_content = first_chunk_content + "".join(
|
||||
[
|
||||
chunk.choices[0].delta.content
|
||||
async for chunk in streamwrapper
|
||||
if chunk.choices[0].delta.content]
|
||||
chunk.choices[0].delta.content
|
||||
async for chunk in streamwrapper
|
||||
if chunk.choices[0].delta.content
|
||||
]
|
||||
)
|
||||
function_call = json.loads(response_content)
|
||||
delta = litellm.utils.Delta(
|
||||
|
@ -324,7 +333,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -373,7 +385,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
tool_calls=[
|
||||
{
|
||||
"id": f"call_{str(uuid.uuid4())}",
|
||||
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])},
|
||||
"function": {
|
||||
"name": function_call["name"],
|
||||
"arguments": json.dumps(function_call["arguments"]),
|
||||
},
|
||||
"type": "function",
|
||||
}
|
||||
],
|
||||
|
@ -387,7 +402,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
|
|||
model_response["created"] = int(time.time())
|
||||
model_response["model"] = "ollama/" + data["model"]
|
||||
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore
|
||||
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", "")))
|
||||
completion_tokens = response_json.get(
|
||||
"eval_count",
|
||||
len(response_json.get("message", dict()).get("content", "")),
|
||||
)
|
||||
model_response["usage"] = litellm.Usage(
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
|
@ -474,3 +492,25 @@ async def ollama_aembeddings(
|
|||
"total_tokens": total_input_tokens,
|
||||
}
|
||||
return model_response
|
||||
|
||||
|
||||
def ollama_embeddings(
|
||||
api_base: str,
|
||||
model: str,
|
||||
prompts: list,
|
||||
optional_params=None,
|
||||
logging_obj=None,
|
||||
model_response=None,
|
||||
encoding=None,
|
||||
):
|
||||
return asyncio.run(
|
||||
ollama_aembeddings(
|
||||
api_base,
|
||||
model,
|
||||
prompts,
|
||||
optional_params,
|
||||
logging_obj,
|
||||
model_response,
|
||||
encoding,
|
||||
)
|
||||
)
|
||||
|
|
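`ollama_embeddings` above is a thin synchronous wrapper: it just drives `ollama_aembeddings` with `asyncio.run`. The same wrapper shape with a toy coroutine (note `asyncio.run` cannot be called from inside an already-running event loop, which is why the async path is taken directly when `aembedding` is set):

```python
# Sync-over-async wrapper pattern used above (toy coroutine, not the real ollama call).
import asyncio
from typing import List


async def aembed(texts: List[str]) -> List[List[float]]:
    await asyncio.sleep(0)  # stands in for the real HTTP round trip
    return [[float(len(t))] for t in texts]


def embed(texts: List[str]) -> List[List[float]]:
    return asyncio.run(aembed(texts))


assert embed(["hi", "there"]) == [[2.0], [5.0]]
```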
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
|
|
|
@ -22,7 +22,6 @@ from litellm.utils import (
|
|||
TextCompletionResponse,
|
||||
)
|
||||
from typing import Callable, Optional
|
||||
import aiohttp, requests
|
||||
import litellm
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
from openai import OpenAI, AsyncOpenAI
|
||||
|
@ -531,6 +530,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model=model,
|
||||
custom_llm_provider="openai",
|
||||
logging_obj=logging_obj,
|
||||
stream_options=data.get("stream_options", None),
|
||||
)
|
||||
return streamwrapper
|
||||
|
||||
|
@ -580,6 +580,7 @@ class OpenAIChatCompletion(BaseLLM):
|
|||
model=model,
|
||||
custom_llm_provider="openai",
|
||||
logging_obj=logging_obj,
|
||||
stream_options=data.get("stream_options", None),
|
||||
)
|
||||
return streamwrapper
|
||||
except (
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
|
|
|
@ -981,7 +981,7 @@ def anthropic_messages_pt(messages: list):
|
|||
# add role=tool support to allow function call result/error submission
|
||||
user_message_types = {"user", "tool", "function"}
|
||||
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
|
||||
new_messages = []
|
||||
new_messages: list = []
|
||||
msg_i = 0
|
||||
tool_use_param = False
|
||||
while msg_i < len(messages):
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
import os, types
|
||||
import json
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import os, types, traceback
|
||||
from enum import Enum
|
||||
import json
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, Any
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage
|
||||
import sys
|
||||
from copy import deepcopy
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
||||
|
@ -295,7 +295,7 @@ def completion(
|
|||
EndpointName={model},
|
||||
InferenceComponentName={model_id},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
Body={data}, # type: ignore
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
|
@ -321,7 +321,7 @@ def completion(
|
|||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
Body={data}, # type: ignore
|
||||
CustomAttributes="accept_eula=true",
|
||||
)
|
||||
""" # type: ignore
|
||||
|
@ -688,7 +688,7 @@ def embedding(
|
|||
response = client.invoke_endpoint(
|
||||
EndpointName={model},
|
||||
ContentType="application/json",
|
||||
Body={data},
|
||||
Body={data}, # type: ignore
|
||||
CustomAttributes="accept_eula=true",
|
||||
)""" # type: ignore
|
||||
logging_obj.pre_call(
|
||||
|
|
|
@ -6,11 +6,11 @@ Reference: https://docs.together.ai/docs/openai-api-compatibility
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional
|
||||
import litellm
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import requests # type: ignore
|
||||
import time
|
||||
from typing import Callable, Optional, Union, List
|
||||
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
|
||||
import litellm, uuid
|
||||
import httpx, inspect
|
||||
import httpx, inspect # type: ignore
|
||||
|
||||
|
||||
class VertexAIError(Exception):
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
import os, types
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests, copy
|
||||
import requests, copy # type: ignore
|
||||
import time, uuid
|
||||
from typing import Callable, Optional, List
|
||||
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
|
||||
|
@ -17,7 +17,7 @@ from .prompt_templates.factory import (
|
|||
extract_between_tags,
|
||||
parse_xml_params,
|
||||
)
|
||||
import httpx
|
||||
import httpx # type: ignore
|
||||
|
||||
|
||||
class VertexAIError(Exception):
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
import os
|
||||
import json
|
||||
from enum import Enum
|
||||
import requests
|
||||
import time, httpx
|
||||
import requests # type: ignore
|
||||
import time, httpx # type: ignore
|
||||
from typing import Callable, Any
|
||||
from litellm.utils import ModelResponse, Usage
|
||||
from .prompt_templates.factory import prompt_factory, custom_prompt
|
||||
|
|
|
@ -3,8 +3,8 @@ import json, types, time # noqa: E401
|
|||
from contextlib import contextmanager
|
||||
from typing import Callable, Dict, Optional, Any, Union, List
|
||||
|
||||
import httpx
|
||||
import requests
|
||||
import httpx # type: ignore
|
||||
import requests # type: ignore
|
||||
import litellm
|
||||
from litellm.utils import ModelResponse, get_secret, Usage
|
||||
|
||||
|
|
|
@ -12,9 +12,9 @@ from typing import Any, Literal, Union, BinaryIO
|
|||
from functools import partial
|
||||
import dotenv, traceback, random, asyncio, time, contextvars
|
||||
from copy import deepcopy
|
||||
|
||||
import httpx
|
||||
import litellm
|
||||
|
||||
from ._logging import verbose_logger
|
||||
from litellm import ( # type: ignore
|
||||
client,
|
||||
|
@ -188,6 +188,7 @@ async def acompletion(
|
|||
top_p: Optional[float] = None,
|
||||
n: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
stream_options: Optional[dict] = None,
|
||||
stop=None,
|
||||
max_tokens: Optional[int] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
|
@ -207,6 +208,7 @@ async def acompletion(
|
|||
api_version: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
model_list: Optional[list] = None, # pass in a list of api_base,keys, etc.
|
||||
extra_headers: Optional[dict] = None,
|
||||
# Optional liteLLM function params
|
||||
**kwargs,
|
||||
):
|
||||
|
@ -224,6 +226,7 @@ async def acompletion(
|
|||
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
|
||||
n (int, optional): The number of completions to generate (default is 1).
|
||||
stream (bool, optional): If True, return a streaming response (default is False).
|
||||
stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
|
||||
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
|
||||
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
|
||||
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
|
||||
|
@ -261,6 +264,7 @@ async def acompletion(
|
|||
"top_p": top_p,
|
||||
"n": n,
|
||||
"stream": stream,
|
||||
"stream_options": stream_options,
|
||||
"stop": stop,
|
||||
"max_tokens": max_tokens,
|
||||
"presence_penalty": presence_penalty,
|
||||
|
@ -305,6 +309,7 @@ async def acompletion(
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "text-completion-openai"
|
||||
or custom_llm_provider == "huggingface"
|
||||
or custom_llm_provider == "ollama"
|
||||
|
@ -457,6 +462,7 @@ def completion(
|
|||
top_p: Optional[float] = None,
|
||||
n: Optional[int] = None,
|
||||
stream: Optional[bool] = None,
|
||||
stream_options: Optional[dict] = None,
|
||||
stop=None,
|
||||
max_tokens: Optional[int] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
|
@ -496,6 +502,7 @@ def completion(
|
|||
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
|
||||
n (int, optional): The number of completions to generate (default is 1).
|
||||
stream (bool, optional): If True, return a streaming response (default is False).
|
||||
stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
|
||||
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
|
||||
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
|
||||
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
|
||||
|
@ -573,6 +580,7 @@ def completion(
|
|||
"top_p",
|
||||
"n",
|
||||
"stream",
|
||||
"stream_options",
|
||||
"stop",
|
||||
"max_tokens",
|
||||
"presence_penalty",
|
||||
|
@ -648,6 +656,8 @@ def completion(
|
|||
"base_model",
|
||||
"stream_timeout",
|
||||
"supports_system_message",
|
||||
"region_name",
|
||||
"allowed_model_region",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
@ -783,6 +793,7 @@ def completion(
|
|||
top_p=top_p,
|
||||
n=n,
|
||||
stream=stream,
|
||||
stream_options=stream_options,
|
||||
stop=stop,
|
||||
max_tokens=max_tokens,
|
||||
presence_penalty=presence_penalty,
|
||||
|
@ -982,6 +993,7 @@ def completion(
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "anyscale"
|
||||
or custom_llm_provider == "mistral"
|
||||
or custom_llm_provider == "openai"
|
||||
|
@ -2565,6 +2577,7 @@ async def aembedding(*args, **kwargs):
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "fireworks_ai"
|
||||
or custom_llm_provider == "ollama"
|
||||
or custom_llm_provider == "vertex_ai"
|
||||
|
@ -2714,6 +2727,8 @@ def embedding(
|
|||
"ttl",
|
||||
"cache",
|
||||
"no-log",
|
||||
"region_name",
|
||||
"allowed_model_region",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
@ -2947,16 +2962,18 @@ def embedding(
|
|||
model=model, # type: ignore
|
||||
llm_provider="ollama", # type: ignore
|
||||
)
|
||||
if aembedding:
|
||||
response = ollama.ollama_aembeddings(
|
||||
api_base=api_base,
|
||||
model=model,
|
||||
prompts=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
model_response=EmbeddingResponse(),
|
||||
)
|
||||
ollama_embeddings_fn = (
|
||||
ollama.ollama_aembeddings if aembedding else ollama.ollama_embeddings
|
||||
)
|
||||
response = ollama_embeddings_fn(
|
||||
api_base=api_base,
|
||||
model=model,
|
||||
prompts=input,
|
||||
encoding=encoding,
|
||||
logging_obj=logging,
|
||||
optional_params=optional_params,
|
||||
model_response=EmbeddingResponse(),
|
||||
)
|
||||
elif custom_llm_provider == "sagemaker":
|
||||
response = sagemaker.embedding(
|
||||
model=model,
|
||||
|
@ -3085,11 +3102,13 @@ async def atext_completion(*args, **kwargs):
|
|||
or custom_llm_provider == "deepinfra"
|
||||
or custom_llm_provider == "perplexity"
|
||||
or custom_llm_provider == "groq"
|
||||
or custom_llm_provider == "deepseek"
|
||||
or custom_llm_provider == "fireworks_ai"
|
||||
or custom_llm_provider == "text-completion-openai"
|
||||
or custom_llm_provider == "huggingface"
|
||||
or custom_llm_provider == "ollama"
|
||||
or custom_llm_provider == "vertex_ai"
|
||||
or custom_llm_provider in litellm.openai_compatible_providers
|
||||
): # currently implemented aiohttp calls for just azure and openai, soon all.
|
||||
# Await normally
|
||||
response = await loop.run_in_executor(None, func_with_context)
|
||||
|
@ -3120,6 +3139,8 @@ async def atext_completion(*args, **kwargs):
|
|||
## TRANSLATE CHAT TO TEXT FORMAT ##
|
||||
if isinstance(response, TextCompletionResponse):
|
||||
return response
|
||||
elif asyncio.iscoroutine(response):
|
||||
response = await response
|
||||
|
||||
text_completion_response = TextCompletionResponse()
|
||||
text_completion_response["id"] = response.get("id", None)
|
||||
|
@ -3581,6 +3602,8 @@ def image_generation(
|
|||
"caching_groups",
|
||||
"ttl",
|
||||
"cache",
|
||||
"region_name",
|
||||
"allowed_model_region",
|
||||
]
|
||||
default_params = openai_params + litellm_params
|
||||
non_default_params = {
|
||||
|
|
|
@ -739,6 +739,24 @@
|
|||
"litellm_provider": "mistral",
|
||||
"mode": "embedding"
|
||||
},
|
||||
"deepseek-chat": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 32000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000014,
|
||||
"output_cost_per_token": 0.00000028,
|
||||
"litellm_provider": "deepseek",
|
||||
"mode": "chat"
|
||||
},
|
||||
"deepseek-coder": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 16000,
|
||||
"max_output_tokens": 4096,
|
||||
"input_cost_per_token": 0.00000014,
|
||||
"output_cost_per_token": 0.00000028,
|
||||
"litellm_provider": "deepseek",
|
||||
"mode": "chat"
|
||||
},
|
||||
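The new `deepseek-chat` / `deepseek-coder` entries price both models at 0.14 USD per million input tokens and 0.28 USD per million output tokens. The cost arithmetic these per-token fields feed into:

```python
# Cost arithmetic implied by the per-token fields above, using the deepseek-chat rates.
def request_cost(prompt_tokens: int, completion_tokens: int,
                 input_cost_per_token: float, output_cost_per_token: float) -> float:
    return prompt_tokens * input_cost_per_token + completion_tokens * output_cost_per_token


cost = request_cost(1000, 500, input_cost_per_token=0.00000014, output_cost_per_token=0.00000028)
assert abs(cost - 0.00028) < 1e-12
```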
"groq/llama2-70b-4096": {
|
||||
"max_tokens": 4096,
|
||||
"max_input_tokens": 4096,
|
||||
|
@ -1060,8 +1078,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1072,8 +1090,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
@ -1084,8 +1102,8 @@
|
|||
"max_tokens": 8192,
|
||||
"max_input_tokens": 1000000,
|
||||
"max_output_tokens": 8192,
|
||||
"input_cost_per_token": 0,
|
||||
"output_cost_per_token": 0,
|
||||
"input_cost_per_token": 0.000000625,
|
||||
"output_cost_per_token": 0.000001875,
|
||||
"litellm_provider": "vertex_ai-language-models",
|
||||
"mode": "chat",
|
||||
"supports_function_calling": true,
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
[Generated Next.js dashboard assets (ui/litellm-dashboard build output) were regenerated. The only substantive changes are hash/ID bumps: build ID "e55gTzpa2g2-9SwXgA9Uo" -> "K8KXTbmuI2ArWjjdMi2iq", webpack chunk "webpack-202e312607f242a1.js" -> "webpack-5b257e1ab47d4b4a.js", page chunk "app/page-d9bdfedbff191985.js" -> "app/page-c804e862b63be987.js", supporting chunk "142-11990a208bf93746.js" -> "566-ccd699ab19124658.js", stylesheet "00c2ddbcd01819c0.css" -> "a1602eb39f799143.css". The full minified HTML and RSC payload diffs are omitted here.]
|
|
|
@ -4,6 +4,22 @@ model_list:
|
|||
api_key: my-fake-key
|
||||
model: openai/my-fake-model
|
||||
model_name: fake-openai-endpoint
|
||||
- litellm_params:
|
||||
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
||||
api_key: my-fake-key-2
|
||||
model: openai/my-fake-model-2
|
||||
model_name: fake-openai-endpoint
|
||||
- litellm_params:
|
||||
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
|
||||
api_key: my-fake-key-3
|
||||
model: openai/my-fake-model-3
|
||||
model_name: fake-openai-endpoint
|
||||
- model_name: gpt-4
|
||||
litellm_params:
|
||||
model: gpt-3.5-turbo
|
||||
- litellm_params:
|
||||
model: together_ai/codellama/CodeLlama-13b-Instruct-hf
|
||||
model_name: CodeLlama-13b-Instruct
|
||||
router_settings:
|
||||
num_retries: 0
|
||||
enable_pre_call_checks: true
|
||||
|
@ -15,8 +31,11 @@ router_settings:
|
|||
routing_strategy: "latency-based-routing"
|
||||
|
||||
litellm_settings:
|
||||
success_callback: ["openmeter"]
|
||||
success_callback: ["langfuse"]
|
||||
|
||||
general_settings:
|
||||
alerting: ["slack"]
|
||||
alert_types: ["llm_exceptions"]
|
||||
alert_types: ["llm_exceptions", "daily_reports"]
|
||||
alerting_args:
|
||||
daily_report_frequency: 60 # every minute
|
||||
report_check_interval: 5 # every 5s
|
|
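For readers who set the router up in code rather than through the proxy YAML above, the following is a minimal sketch of the equivalent programmatic settings. The model entry mirrors the fake-openai-endpoint from the config; the rest of the arguments are illustrative, not the proxy's own config loader.

```
# Rough programmatic equivalent of the router_settings shown above (a sketch, assumptions noted inline).
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
            },
        },
    ],
    routing_strategy="latency-based-routing",
    num_retries=0,
    enable_pre_call_checks=True,  # enables the context-window / rpm / region pre-call filters
)
```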
@ -458,6 +458,27 @@ class UpdateUserRequest(GenerateRequestBase):
|
|||
return values
|
||||
|
||||
|
||||
class NewEndUserRequest(LiteLLMBase):
|
||||
user_id: str
|
||||
alias: Optional[str] = None # human-friendly alias
|
||||
blocked: bool = False # allow/disallow requests for this end-user
|
||||
max_budget: Optional[float] = None
|
||||
budget_id: Optional[str] = None # give either a budget_id or max_budget
|
||||
allowed_model_region: Optional[Literal["eu"]] = (
|
||||
None # require all user requests to use models in this specific region
|
||||
)
|
||||
default_model: Optional[str] = (
|
||||
None # if no equivalent model in allowed region - default all requests to this model
|
||||
)
|
||||
|
||||
@root_validator(pre=True)
|
||||
def check_user_info(cls, values):
|
||||
if values.get("max_budget") is not None and values.get("budget_id") is not None:
|
||||
raise ValueError("Set either 'max_budget' or 'budget_id', not both.")
|
||||
|
||||
return values
|
||||
|
||||
|
||||
class Member(LiteLLMBase):
|
||||
role: Literal["admin", "user"]
|
||||
user_id: Optional[str] = None
|
||||
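The interesting part of `NewEndUserRequest` is the mutual-exclusion rule between `max_budget` and `budget_id`. Below is a minimal standalone sketch of that same check, using a pydantic v1-style `root_validator` as in the diff; the class name here is illustrative, not a litellm type.

```
from typing import Optional
from pydantic import BaseModel, root_validator


class EndUserBudget(BaseModel):
    # mirrors the rule in NewEndUserRequest: give either a budget_id or a max_budget, not both
    max_budget: Optional[float] = None
    budget_id: Optional[str] = None

    @root_validator(pre=True)
    def check_budget(cls, values):
        if values.get("max_budget") is not None and values.get("budget_id") is not None:
            raise ValueError("Set either 'max_budget' or 'budget_id', not both.")
        return values


EndUserBudget(max_budget=10.0)                       # ok
# EndUserBudget(max_budget=10.0, budget_id="b-1")    # raises ValueError
```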
|
@ -494,6 +515,8 @@ class NewTeamRequest(TeamBase):
|
|||
|
||||
class GlobalEndUsersSpend(LiteLLMBase):
|
||||
api_key: Optional[str] = None
|
||||
startTime: Optional[datetime] = None
|
||||
endTime: Optional[datetime] = None
|
||||
|
||||
|
||||
class TeamMemberAddRequest(LiteLLMBase):
|
||||
|
@ -836,6 +859,7 @@ class UserAPIKeyAuth(
|
|||
|
||||
api_key: Optional[str] = None
|
||||
user_role: Optional[Literal["proxy_admin", "app_owner", "app_user"]] = None
|
||||
allowed_model_region: Optional[Literal["eu"]] = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
def check_api_key(cls, values):
|
||||
|
@ -881,6 +905,8 @@ class LiteLLM_EndUserTable(LiteLLMBase):
|
|||
blocked: bool
|
||||
alias: Optional[str] = None
|
||||
spend: float = 0.0
|
||||
allowed_model_region: Optional[Literal["eu"]] = None
|
||||
default_model: Optional[str] = None
|
||||
litellm_budget_table: Optional[LiteLLM_BudgetTable] = None
|
||||
|
||||
@root_validator(pre=True)
|
||||
|
|
|
@ -206,9 +206,9 @@ async def get_end_user_object(
|
|||
|
||||
if end_user_id is None:
|
||||
return None
|
||||
|
||||
_key = "end_user_id:{}".format(end_user_id)
|
||||
# check if in cache
|
||||
cached_user_obj = user_api_key_cache.async_get_cache(key=end_user_id)
|
||||
cached_user_obj = await user_api_key_cache.async_get_cache(key=_key)
|
||||
if cached_user_obj is not None:
|
||||
if isinstance(cached_user_obj, dict):
|
||||
return LiteLLM_EndUserTable(**cached_user_obj)
|
||||
|
@ -223,7 +223,14 @@ async def get_end_user_object(
|
|||
if response is None:
|
||||
raise Exception
|
||||
|
||||
return LiteLLM_EndUserTable(**response.dict())
|
||||
# save the end-user object to cache
|
||||
await user_api_key_cache.async_set_cache(
|
||||
key="end_user_id:{}".format(end_user_id), value=response
|
||||
)
|
||||
|
||||
_response = LiteLLM_EndUserTable(**response.dict())
|
||||
|
||||
return _response
|
||||
except Exception as e: # if end-user not in db
|
||||
return None
|
||||
|
||||
|
|
|
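The change above turns `get_end_user_object` into a proper read-through cache: the key is prefixed with `end_user_id:`, the async cache read is awaited, and a DB hit is written back into the cache. A minimal sketch of that pattern follows; `cache` and `fetch_end_user_from_db` are stand-ins passed in by the caller, not litellm APIs.

```
from typing import Optional


async def get_end_user(end_user_id: Optional[str], cache, fetch_end_user_from_db) -> Optional[dict]:
    if end_user_id is None:
        return None

    _key = "end_user_id:{}".format(end_user_id)

    # check the cache first
    cached = await cache.async_get_cache(key=_key)
    if cached is not None:
        return cached

    try:
        # fall back to the DB, then save the row to the cache for next time
        row = await fetch_end_user_from_db(end_user_id)
        if row is None:
            raise Exception("end user not found")
        await cache.async_set_cache(key=_key, value=row)
        return row
    except Exception:
        return None  # end-user not in DB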
@ -15,6 +15,9 @@ from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
|
|||
from litellm.proxy.utils import PrismaClient
|
||||
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
|
||||
from typing import Optional
|
||||
from cryptography import x509
|
||||
from cryptography.hazmat.backends import default_backend
|
||||
from cryptography.hazmat.primitives import serialization
|
||||
|
||||
|
||||
class JWTHandler:
|
||||
|
@ -142,8 +145,8 @@ class JWTHandler:
|
|||
public_key = keys[0]
|
||||
elif len(keys) > 1:
|
||||
for key in keys:
|
||||
if kid is not None and key["kid"] == kid:
|
||||
public_key = key
|
||||
if kid is not None and key == kid:
|
||||
public_key = keys[key]
|
||||
|
||||
if public_key is None:
|
||||
raise Exception(
|
||||
|
@ -153,6 +156,11 @@ class JWTHandler:
|
|||
return public_key
|
||||
|
||||
async def auth_jwt(self, token: str) -> dict:
|
||||
audience = os.getenv("JWT_AUDIENCE")
|
||||
decode_options = None
|
||||
if audience is None:
|
||||
decode_options = {"verify_aud": False}
|
||||
|
||||
from jwt.algorithms import RSAAlgorithm
|
||||
|
||||
header = jwt.get_unverified_header(token)
|
||||
|
@ -182,7 +190,33 @@ class JWTHandler:
|
|||
token,
|
||||
public_key_rsa, # type: ignore
|
||||
algorithms=["RS256"],
|
||||
options={"verify_aud": False},
|
||||
options=decode_options,
|
||||
audience=audience,
|
||||
)
|
||||
return payload
|
||||
|
||||
except jwt.ExpiredSignatureError:
|
||||
# the token is expired, do something to refresh it
|
||||
raise Exception("Token Expired")
|
||||
except Exception as e:
|
||||
raise Exception(f"Validation fails: {str(e)}")
|
||||
elif public_key is not None and isinstance(public_key, str):
|
||||
try:
|
||||
cert = x509.load_pem_x509_certificate(public_key.encode(), default_backend())
|
||||
|
||||
# Extract public key
|
||||
key = cert.public_key().public_bytes(
|
||||
serialization.Encoding.PEM,
|
||||
serialization.PublicFormat.SubjectPublicKeyInfo
|
||||
)
|
||||
|
||||
# decode the token using the public key
|
||||
payload = jwt.decode(
|
||||
token,
|
||||
key,
|
||||
algorithms=["RS256"],
|
||||
audience=audience,
|
||||
options=decode_options
|
||||
)
|
||||
return payload
|
||||
|
||||
|
|
|
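The new branch above handles a PEM x509 certificate as the JWT public key, and only verifies the audience when `JWT_AUDIENCE` is set. The sketch below isolates that flow with PyJWT plus `cryptography`; the token and certificate values are placeholders supplied by the caller.

```
import os
import jwt
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization


def decode_with_certificate(token: str, pem_certificate: str) -> dict:
    audience = os.getenv("JWT_AUDIENCE")
    decode_options = {"verify_aud": False} if audience is None else None

    # extract the RSA public key from the certificate
    cert = x509.load_pem_x509_certificate(pem_certificate.encode(), default_backend())
    public_key = cert.public_key().public_bytes(
        serialization.Encoding.PEM,
        serialization.PublicFormat.SubjectPublicKeyInfo,
    )

    # verify the token; audience is only enforced when JWT_AUDIENCE is configured
    return jwt.decode(
        token,
        public_key,
        algorithms=["RS256"],
        audience=audience,
        options=decode_options,
    )
```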
@ -252,7 +252,7 @@ def run_server(
|
|||
if model and "ollama" in model and api_base is None:
|
||||
run_ollama_serve()
|
||||
if test_async is True:
|
||||
import requests, concurrent, time
|
||||
import requests, concurrent, time # type: ignore
|
||||
|
||||
api_base = f"http://{host}:{port}"
|
||||
|
||||
|
@ -418,7 +418,7 @@ def run_server(
|
|||
read from there and save it to os.env['DATABASE_URL']
|
||||
"""
|
||||
try:
|
||||
import yaml, asyncio
|
||||
import yaml, asyncio # type: ignore
|
||||
except:
|
||||
raise ImportError(
|
||||
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"
|
||||
|
|
File diff suppressed because it is too large
|
@ -150,6 +150,8 @@ model LiteLLM_EndUserTable {
|
|||
user_id String @id
|
||||
alias String? // admin-facing alias
|
||||
spend Float @default(0.0)
|
||||
allowed_model_region String? // require all user requests to use models in this specific region
|
||||
default_model String? // use along with 'allowed_model_region'. if no available model in region, default to this model.
|
||||
budget_id String?
|
||||
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
|
||||
blocked Boolean @default(false)
|
||||
|
|
|
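With the new `allowed_model_region` and `default_model` columns, an end-user row can pin requests to a region. The sketch below assumes the generated prisma-client-py client (model attributes are lowercased, as in `self.db = Prisma()` used elsewhere in this diff); the user id, alias, and model name are placeholders.

```
import asyncio
from prisma import Prisma


async def main():
    db = Prisma()
    await db.connect()
    await db.litellm_endusertable.create(
        data={
            "user_id": "end-user-1",
            "alias": "Acme frontend",
            "allowed_model_region": "eu",        # require EU-hosted deployments
            "default_model": "azure/gpt-35-eu",  # placeholder fallback when no EU deployment matches
            "blocked": False,
        }
    )
    await db.disconnect()


asyncio.run(main())
```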
@ -73,6 +73,7 @@ class ProxyLogging:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
] = [
|
||||
"llm_exceptions",
|
||||
|
@ -80,11 +81,13 @@ class ProxyLogging:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
self.slack_alerting_instance = SlackAlerting(
|
||||
alerting_threshold=self.alerting_threshold,
|
||||
alerting=self.alerting,
|
||||
alert_types=self.alert_types,
|
||||
internal_usage_cache=self.internal_usage_cache,
|
||||
)
|
||||
|
||||
def update_values(
|
||||
|
@ -100,9 +103,11 @@ class ProxyLogging:
|
|||
"llm_requests_hanging",
|
||||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
"daily_reports",
|
||||
]
|
||||
]
|
||||
] = None,
|
||||
alerting_args: Optional[dict] = None,
|
||||
):
|
||||
self.alerting = alerting
|
||||
if alerting_threshold is not None:
|
||||
|
@ -114,8 +119,12 @@ class ProxyLogging:
|
|||
alerting=self.alerting,
|
||||
alerting_threshold=self.alerting_threshold,
|
||||
alert_types=self.alert_types,
|
||||
alerting_args=alerting_args,
|
||||
)
|
||||
|
||||
if "daily_reports" in self.alert_types:
|
||||
litellm.callbacks.append(self.slack_alerting_instance) # type: ignore
|
||||
|
||||
if redis_cache is not None:
|
||||
self.internal_usage_cache.redis_cache = redis_cache
|
||||
|
||||
|
@ -293,6 +302,7 @@ class ProxyLogging:
|
|||
"budget_alerts",
|
||||
"db_exceptions",
|
||||
],
|
||||
request_data: Optional[dict] = None,
|
||||
):
|
||||
"""
|
||||
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
|
||||
|
@ -322,10 +332,19 @@ class ProxyLogging:
|
|||
if _proxy_base_url is not None:
|
||||
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
|
||||
|
||||
extra_kwargs = {}
|
||||
if request_data is not None:
|
||||
_url = self.slack_alerting_instance._add_langfuse_trace_id_to_alert(
|
||||
request_data=request_data
|
||||
)
|
||||
if _url is not None:
|
||||
extra_kwargs["🪢 Langfuse Trace"] = _url
|
||||
formatted_message += "\n\n🪢 Langfuse Trace: {}".format(_url)
|
||||
|
||||
for client in self.alerting:
|
||||
if client == "slack":
|
||||
await self.slack_alerting_instance.send_alert(
|
||||
message=message, level=level, alert_type=alert_type
|
||||
message=message, level=level, alert_type=alert_type, **extra_kwargs
|
||||
)
|
||||
elif client == "sentry":
|
||||
if litellm.utils.sentry_sdk_instance is not None:
|
||||
|
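As a rough sketch of the enrichment added above: when the failing request carries Langfuse data, the trace URL is attached both as a structured field and inline in the alert message. This assumes litellm's `SlackAlerting` instance and the `_add_langfuse_trace_id_to_alert` helper referenced in this diff; the level and alert type are example values.

```
from litellm.integrations.slack_alerting import SlackAlerting


async def alert_with_trace(slack: SlackAlerting, message: str, request_data: dict):
    extra_kwargs = {}
    _url = slack._add_langfuse_trace_id_to_alert(request_data=request_data)
    if _url is not None:
        extra_kwargs["🪢 Langfuse Trace"] = _url
        message += "\n\n🪢 Langfuse Trace: {}".format(_url)
    await slack.send_alert(
        message=message, level="High", alert_type="llm_exceptions", **extra_kwargs
    )
```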
@ -360,6 +379,7 @@ class ProxyLogging:
|
|||
message=f"DB read/write call failed: {error_message}",
|
||||
level="High",
|
||||
alert_type="db_exceptions",
|
||||
request_data={},
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -375,7 +395,10 @@ class ProxyLogging:
|
|||
litellm.utils.capture_exception(error=original_exception)
|
||||
|
||||
async def post_call_failure_hook(
|
||||
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth
|
||||
self,
|
||||
original_exception: Exception,
|
||||
user_api_key_dict: UserAPIKeyAuth,
|
||||
request_data: dict,
|
||||
):
|
||||
"""
|
||||
Allows users to raise custom exceptions/log when a call fails, without having to deal with parsing Request body.
|
||||
|
@ -400,6 +423,7 @@ class ProxyLogging:
|
|||
message=f"LLM API call failed: {str(original_exception)}",
|
||||
level="High",
|
||||
alert_type="llm_exceptions",
|
||||
request_data=request_data,
|
||||
)
|
||||
)
|
||||
|
||||
|
@ -502,7 +526,7 @@ class PrismaClient:
|
|||
finally:
|
||||
os.chdir(original_dir)
|
||||
# Now you can import the Prisma Client
|
||||
from prisma import Prisma # type: ignore
|
||||
from prisma import Prisma
|
||||
|
||||
self.db = Prisma() # Client to connect to Prisma db
|
||||
|
||||
|
@ -1665,12 +1689,12 @@ def get_instance_fn(value: str, config_file_path: Optional[str] = None) -> Any:
|
|||
module_file_path = os.path.join(directory, *module_name.split("."))
|
||||
module_file_path += ".py"
|
||||
|
||||
spec = importlib.util.spec_from_file_location(module_name, module_file_path)
|
||||
spec = importlib.util.spec_from_file_location(module_name, module_file_path) # type: ignore
|
||||
if spec is None:
|
||||
raise ImportError(
|
||||
f"Could not find a module specification for {module_file_path}"
|
||||
)
|
||||
module = importlib.util.module_from_spec(spec)
|
||||
module = importlib.util.module_from_spec(spec) # type: ignore
|
||||
spec.loader.exec_module(module) # type: ignore
|
||||
else:
|
||||
# Dynamically import the module
|
||||
|
|
|
@ -21,6 +21,7 @@ from collections import defaultdict
|
|||
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
|
||||
from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler
|
||||
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
|
||||
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
|
||||
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2
|
||||
from litellm.llms.custom_httpx.azure_dall_e_2 import (
|
||||
CustomHTTPTransport,
|
||||
|
@ -31,6 +32,7 @@ from litellm.utils import (
|
|||
CustomStreamWrapper,
|
||||
get_utc_datetime,
|
||||
calculate_max_parallel_requests,
|
||||
_is_region_eu,
|
||||
)
|
||||
import copy
|
||||
from litellm._logging import verbose_router_logger
|
||||
|
@ -43,6 +45,7 @@ from litellm.types.router import (
|
|||
updateDeployment,
|
||||
updateLiteLLMParams,
|
||||
RetryPolicy,
|
||||
AlertingConfig,
|
||||
)
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
|
||||
|
@ -98,9 +101,11 @@ class Router:
|
|||
"least-busy",
|
||||
"usage-based-routing",
|
||||
"latency-based-routing",
|
||||
"cost-based-routing",
|
||||
] = "simple-shuffle",
|
||||
routing_strategy_args: dict = {}, # just for latency-based routing
|
||||
semaphore: Optional[asyncio.Semaphore] = None,
|
||||
alerting_config: Optional[AlertingConfig] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
|
||||
|
@ -127,9 +132,9 @@ class Router:
|
|||
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
|
||||
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
|
||||
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
|
||||
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
|
||||
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing", "cost-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
|
||||
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
|
||||
|
||||
alerting_config (AlertingConfig): Slack alerting configuration. Defaults to None.
|
||||
Returns:
|
||||
Router: An instance of the litellm.Router class.
|
||||
|
||||
|
@ -314,6 +319,9 @@ class Router:
|
|||
self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = (
|
||||
model_group_retry_policy
|
||||
)
|
||||
self.alerting_config: Optional[AlertingConfig] = alerting_config
|
||||
if self.alerting_config is not None:
|
||||
self._initialize_alerting()
|
||||
|
||||
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
|
||||
if routing_strategy == "least-busy":
|
||||
|
@ -347,6 +355,14 @@ class Router:
|
|||
)
|
||||
if isinstance(litellm.callbacks, list):
|
||||
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
|
||||
elif routing_strategy == "cost-based-routing":
|
||||
self.lowestcost_logger = LowestCostLoggingHandler(
|
||||
router_cache=self.cache,
|
||||
model_list=self.model_list,
|
||||
routing_args={},
|
||||
)
|
||||
if isinstance(litellm.callbacks, list):
|
||||
litellm.callbacks.append(self.lowestcost_logger) # type: ignore
|
||||
|
||||
def print_deployment(self, deployment: dict):
|
||||
"""
|
||||
|
@ -1847,6 +1863,10 @@ class Router:
|
|||
self.cache.set_cache(
|
||||
value=cached_value, key=cooldown_key, ttl=cooldown_time
|
||||
)
|
||||
|
||||
self.send_deployment_cooldown_alert(
|
||||
deployment_id=deployment, exception_status=exception_status
|
||||
)
|
||||
else:
|
||||
self.failed_calls.set_cache(
|
||||
key=deployment, value=updated_fails, ttl=cooldown_time
|
||||
|
@ -1980,7 +2000,11 @@ class Router:
|
|||
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env
|
||||
# we do this here because we init clients for Azure, OpenAI and we need to set the right key
|
||||
api_key = litellm_params.get("api_key") or default_api_key
|
||||
if api_key and api_key.startswith("os.environ/"):
|
||||
if (
|
||||
api_key
|
||||
and isinstance(api_key, str)
|
||||
and api_key.startswith("os.environ/")
|
||||
):
|
||||
api_key_env_name = api_key.replace("os.environ/", "")
|
||||
api_key = litellm.get_secret(api_key_env_name)
|
||||
litellm_params["api_key"] = api_key
|
||||
|
@ -2004,6 +2028,7 @@ class Router:
|
|||
if (
|
||||
is_azure_ai_studio_model == True
|
||||
and api_base is not None
|
||||
and isinstance(api_base, str)
|
||||
and not api_base.endswith("/v1/")
|
||||
):
|
||||
# check if it ends with a trailing slash
|
||||
|
@ -2084,13 +2109,14 @@ class Router:
|
|||
organization = litellm.get_secret(organization_env_name)
|
||||
litellm_params["organization"] = organization
|
||||
|
||||
if "azure" in model_name:
|
||||
if api_base is None:
|
||||
if "azure" in model_name and isinstance(api_key, str):
|
||||
if api_base is None or not isinstance(api_base, str):
|
||||
raise ValueError(
|
||||
f"api_base is required for Azure OpenAI. Set it on your config. Model - {model}"
|
||||
)
|
||||
if api_version is None:
|
||||
api_version = "2023-07-01-preview"
|
||||
|
||||
if "gateway.ai.cloudflare.com" in api_base:
|
||||
if not api_base.endswith("/"):
|
||||
api_base += "/"
|
||||
|
@ -2513,7 +2539,7 @@ class Router:
|
|||
self.default_deployment = deployment.to_json(exclude_none=True)
|
||||
|
||||
# Azure GPT-Vision Enhancements, users can pass os.environ/
|
||||
data_sources = deployment.litellm_params.get("dataSources", [])
|
||||
data_sources = deployment.litellm_params.get("dataSources", []) or []
|
||||
|
||||
for data_source in data_sources:
|
||||
params = data_source.get("parameters", {})
|
||||
|
@ -2530,6 +2556,22 @@ class Router:
|
|||
# init OpenAI, Azure clients
|
||||
self.set_client(model=deployment.to_json(exclude_none=True))
|
||||
|
||||
# set region (if azure model)
|
||||
try:
|
||||
if "azure" in deployment.litellm_params.model:
|
||||
region = litellm.utils.get_model_region(
|
||||
litellm_params=deployment.litellm_params, mode=None
|
||||
)
|
||||
|
||||
deployment.litellm_params.region_name = region
|
||||
except Exception as e:
|
||||
verbose_router_logger.error(
|
||||
"Unable to get the region for azure model - {}, {}".format(
|
||||
deployment.litellm_params.model, str(e)
|
||||
)
|
||||
)
|
||||
pass # [NON-BLOCKING]
|
||||
|
||||
return deployment
|
||||
|
||||
def add_deployment(self, deployment: Deployment) -> Optional[Deployment]:
|
||||
|
@ -2557,6 +2599,38 @@ class Router:
|
|||
self.model_names.append(deployment.model_name)
|
||||
return deployment
|
||||
|
||||
def upsert_deployment(self, deployment: Deployment) -> Deployment:
|
||||
"""
|
||||
Add or update deployment
|
||||
Parameters:
|
||||
- deployment: Deployment - the deployment to be added to the Router
|
||||
|
||||
Returns:
|
||||
- The added/updated deployment
|
||||
"""
|
||||
# check if deployment already exists
|
||||
|
||||
if deployment.model_info.id in self.get_model_ids():
|
||||
# remove the previous deployment
|
||||
removal_idx: Optional[int] = None
|
||||
for idx, model in enumerate(self.model_list):
|
||||
if model["model_info"]["id"] == deployment.model_info.id:
|
||||
removal_idx = idx
|
||||
|
||||
if removal_idx is not None:
|
||||
self.model_list.pop(removal_idx)
|
||||
|
||||
# add to model list
|
||||
_deployment = deployment.to_json(exclude_none=True)
|
||||
self.model_list.append(_deployment)
|
||||
|
||||
# initialize client
|
||||
self._add_deployment(deployment=deployment)
|
||||
|
||||
# add to model names
|
||||
self.model_names.append(deployment.model_name)
|
||||
return deployment
|
||||
|
||||
def delete_deployment(self, id: str) -> Optional[Deployment]:
|
||||
"""
|
||||
Parameters:
|
||||
|
@ -2580,11 +2654,21 @@ class Router:
|
|||
except:
|
||||
return None
|
||||
|
||||
def get_deployment(self, model_id: str):
|
||||
def get_deployment(self, model_id: str) -> Optional[Deployment]:
|
||||
"""
|
||||
Returns -> Deployment or None
|
||||
|
||||
Raise Exception -> if model found in invalid format
|
||||
"""
|
||||
for model in self.model_list:
|
||||
if "model_info" in model and "id" in model["model_info"]:
|
||||
if model_id == model["model_info"]["id"]:
|
||||
return model
|
||||
if isinstance(model, dict):
|
||||
return Deployment(**model)
|
||||
elif isinstance(model, Deployment):
|
||||
return model
|
||||
else:
|
||||
raise Exception("Model invalid format - {}".format(type(model)))
|
||||
return None
|
||||
|
||||
def get_model_info(self, id: str) -> Optional[dict]:
|
||||
|
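A hedged usage sketch for the new `upsert_deployment` helper and the now-typed `get_deployment` above. `Deployment` comes from `litellm.types.router` as imported in this diff; `LiteLLM_Params` as the params model, plus the keys and model names, are assumptions for illustration.

```
from litellm import Router
from litellm.types.router import Deployment, LiteLLM_Params

router = Router(model_list=[])

dep = Deployment(
    model_name="fake-openai-endpoint",
    litellm_params=LiteLLM_Params(model="openai/my-fake-model", api_key="my-fake-key"),
)

router.upsert_deployment(dep)   # first call adds the deployment
router.upsert_deployment(dep)   # same model_info.id -> old entry removed, new one re-added

print(router.get_model_ids())                                      # list of deployment ids
print(router.get_deployment(model_id=router.get_model_ids()[0]))   # Deployment or None
```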
@ -2597,7 +2681,10 @@ class Router:
|
|||
return model
|
||||
return None
|
||||
|
||||
def get_model_ids(self):
|
||||
def get_model_ids(self) -> List[str]:
|
||||
"""
|
||||
Returns list of model id's.
|
||||
"""
|
||||
ids = []
|
||||
for model in self.model_list:
|
||||
if "model_info" in model and "id" in model["model_info"]:
|
||||
|
@ -2605,7 +2692,7 @@ class Router:
|
|||
ids.append(id)
|
||||
return ids
|
||||
|
||||
def get_model_names(self):
|
||||
def get_model_names(self) -> List[str]:
|
||||
return self.model_names
|
||||
|
||||
def get_model_list(self):
|
||||
|
@ -2631,6 +2718,7 @@ class Router:
|
|||
"retry_after",
|
||||
"fallbacks",
|
||||
"context_window_fallbacks",
|
||||
"model_group_retry_policy",
|
||||
]
|
||||
|
||||
for var in vars_to_include:
|
||||
|
@ -2656,6 +2744,7 @@ class Router:
|
|||
"retry_after",
|
||||
"fallbacks",
|
||||
"context_window_fallbacks",
|
||||
"model_group_retry_policy",
|
||||
]
|
||||
|
||||
_int_settings = [
|
||||
|
@ -2754,14 +2843,17 @@ class Router:
|
|||
model: str,
|
||||
healthy_deployments: List,
|
||||
messages: List[Dict[str, str]],
|
||||
allowed_model_region: Optional[Literal["eu"]] = None,
|
||||
):
|
||||
"""
|
||||
Filter out model in model group, if:
|
||||
|
||||
- model context window < message length
|
||||
- filter models above rpm limits
|
||||
- if region given, filter out models not in that region / unknown region
|
||||
- [TODO] function call and model doesn't support function calling
|
||||
"""
|
||||
|
||||
verbose_router_logger.debug(
|
||||
f"Starting Pre-call checks for deployments in model={model}"
|
||||
)
|
||||
|
@ -2812,9 +2904,9 @@ class Router:
|
|||
except Exception as e:
|
||||
verbose_router_logger.debug("An error occurs - {}".format(str(e)))
|
||||
|
||||
## RPM CHECK ##
|
||||
_litellm_params = deployment.get("litellm_params", {})
|
||||
model_id = deployment.get("model_info", {}).get("id", "")
|
||||
## RPM CHECK ##
|
||||
### get local router cache ###
|
||||
current_request_cache_local = (
|
||||
self.cache.get_cache(key=model_id, local_only=True) or 0
|
||||
|
@ -2842,6 +2934,28 @@ class Router:
|
|||
_rate_limit_error = True
|
||||
continue
|
||||
|
||||
## REGION CHECK ##
|
||||
if allowed_model_region is not None:
|
||||
if _litellm_params.get("region_name") is not None and isinstance(
|
||||
_litellm_params["region_name"], str
|
||||
):
|
||||
# check if in allowed_model_region
|
||||
if (
|
||||
_is_region_eu(model_region=_litellm_params["region_name"])
|
||||
== False
|
||||
):
|
||||
invalid_model_indices.append(idx)
|
||||
continue
|
||||
else:
|
||||
verbose_router_logger.debug(
|
||||
"Filtering out model - {}, as model_region=None, and allowed_model_region={}".format(
|
||||
model_id, allowed_model_region
|
||||
)
|
||||
)
|
||||
# filter out since region unknown, and user wants to filter for specific region
|
||||
invalid_model_indices.append(idx)
|
||||
continue
|
||||
|
||||
if len(invalid_model_indices) == len(_returned_deployments):
|
||||
"""
|
||||
- no healthy deployments available b/c context window checks or rate limit error
|
||||
|
@ -2943,6 +3057,7 @@ class Router:
|
|||
if (
|
||||
self.routing_strategy != "usage-based-routing-v2"
|
||||
and self.routing_strategy != "simple-shuffle"
|
||||
and self.routing_strategy != "cost-based-routing"
|
||||
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
|
||||
return self.get_available_deployment(
|
||||
model=model,
|
||||
|
@ -2980,10 +3095,31 @@ class Router:
|
|||
|
||||
# filter pre-call checks
|
||||
if self.enable_pre_call_checks and messages is not None:
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model, healthy_deployments=healthy_deployments, messages=messages
|
||||
_allowed_model_region = (
|
||||
request_kwargs.get("allowed_model_region")
|
||||
if request_kwargs is not None
|
||||
else None
|
||||
)
|
||||
|
||||
if _allowed_model_region == "eu":
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
allowed_model_region=_allowed_model_region,
|
||||
)
|
||||
else:
|
||||
verbose_router_logger.debug(
|
||||
"Ignoring given 'allowed_model_region'={}. Only 'eu' is allowed".format(
|
||||
_allowed_model_region
|
||||
)
|
||||
)
|
||||
healthy_deployments = self._pre_call_checks(
|
||||
model=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
)
|
||||
|
||||
if len(healthy_deployments) == 0:
|
||||
raise ValueError(
|
||||
f"{RouterErrors.no_deployments_available.value}, passed model={model}"
|
||||
|
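The region check added to `_pre_call_checks` drops deployments whose `region_name` is missing or not in the EU when the caller asks for `allowed_model_region="eu"`. The following is a simplified, self-contained mirror of that filter; the EU prefix list is an assumption for illustration, not litellm's `_is_region_eu` table.

```
from typing import Dict, List, Literal, Optional

EU_REGION_PREFIXES = ("eu-", "europe", "france", "sweden", "switzerland")  # assumption, not litellm's list


def filter_by_region(
    deployments: List[Dict],
    allowed_model_region: Optional[Literal["eu"]] = None,
) -> List[Dict]:
    """Keep deployments whose region_name looks like an EU region; drop unknown regions."""
    if allowed_model_region != "eu":
        return deployments

    kept = []
    for d in deployments:
        region = d.get("litellm_params", {}).get("region_name")
        if isinstance(region, str) and region.lower().startswith(EU_REGION_PREFIXES):
            kept.append(d)
        # no region set -> filtered out, since the caller requires a specific region
    return kept
```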
@ -2999,6 +3135,16 @@ class Router:
|
|||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
if (
|
||||
self.routing_strategy == "cost-based-routing"
|
||||
and self.lowestcost_logger is not None
|
||||
):
|
||||
deployment = await self.lowestcost_logger.async_get_available_deployments(
|
||||
model_group=model,
|
||||
healthy_deployments=healthy_deployments,
|
||||
messages=messages,
|
||||
input=input,
|
||||
)
|
||||
elif self.routing_strategy == "simple-shuffle":
|
||||
# if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
|
||||
############## Check if we can do a RPM/TPM based weighted pick #################
|
||||
|
@ -3266,6 +3412,8 @@ class Router:
|
|||
|
||||
if retry_policy is None:
|
||||
return None
|
||||
if isinstance(retry_policy, dict):
|
||||
retry_policy = RetryPolicy(**retry_policy)
|
||||
if (
|
||||
isinstance(exception, litellm.BadRequestError)
|
||||
and retry_policy.BadRequestErrorRetries is not None
|
||||
|
@ -3292,6 +3440,56 @@ class Router:
|
|||
):
|
||||
return retry_policy.ContentPolicyViolationErrorRetries
|
||||
|
||||
def _initialize_alerting(self):
|
||||
from litellm.integrations.slack_alerting import SlackAlerting
|
||||
|
||||
router_alerting_config: AlertingConfig = self.alerting_config
|
||||
|
||||
_slack_alerting_logger = SlackAlerting(
|
||||
alerting_threshold=router_alerting_config.alerting_threshold,
|
||||
alerting=["slack"],
|
||||
default_webhook_url=router_alerting_config.webhook_url,
|
||||
)
|
||||
|
||||
litellm.callbacks.append(_slack_alerting_logger)
|
||||
litellm.success_callback.append(
|
||||
_slack_alerting_logger.response_taking_too_long_callback
|
||||
)
|
||||
print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa
|
||||
|
||||
def send_deployment_cooldown_alert(
|
||||
self, deployment_id: str, exception_status: Union[str, int]
|
||||
):
|
||||
try:
|
||||
from litellm.proxy.proxy_server import proxy_logging_obj
|
||||
|
||||
# trigger slack alert saying deployment is in cooldown
|
||||
if (
|
||||
proxy_logging_obj is not None
|
||||
and proxy_logging_obj.alerting is not None
|
||||
and "slack" in proxy_logging_obj.alerting
|
||||
):
|
||||
_deployment = self.get_deployment(model_id=deployment_id)
|
||||
if _deployment is None:
|
||||
return
|
||||
|
||||
_litellm_params = _deployment["litellm_params"]
|
||||
temp_litellm_params = copy.deepcopy(_litellm_params)
|
||||
temp_litellm_params = dict(temp_litellm_params)
|
||||
_model_name = _deployment.get("model_name", None)
|
||||
_api_base = litellm.get_api_base(
|
||||
model=_model_name, optional_params=temp_litellm_params
|
||||
)
|
||||
asyncio.create_task(
|
||||
proxy_logging_obj.slack_alerting_instance.send_alert(
|
||||
message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
|
||||
alert_type="cooldown_deployment",
|
||||
level="Low",
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
def flush_cache(self):
|
||||
litellm.cache = None
|
||||
self.cache.flush_cache()
|
||||
|
|
|
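Tying the new pieces together: passing `alerting_config` makes the Router build its own `SlackAlerting` logger via `_initialize_alerting` above. A small sketch follows; the `AlertingConfig` field names come from this diff, while the webhook URL, threshold value, and model entry are placeholders.

```
from litellm import Router
from litellm.types.router import AlertingConfig

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {"model": "openai/my-fake-model", "api_key": "my-fake-key"},
        }
    ],
    alerting_config=AlertingConfig(
        webhook_url="https://hooks.slack.com/services/...",  # placeholder Slack webhook
        alerting_threshold=300,                              # example threshold, in seconds
    ),
)
```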
@ -6,7 +6,7 @@
|
|||
# - use litellm.success + failure callbacks to log when a request completed
|
||||
# - in get_available_deployment, for a given model group name -> pick based on traffic
|
||||
|
||||
import dotenv, os, requests, random
|
||||
import dotenv, os, requests, random # type: ignore
|
||||
from typing import Optional
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
|
|
350	litellm/router_strategy/lowest_cost.py (new file)
|
@ -0,0 +1,350 @@
|
|||
#### What this does ####
|
||||
# picks the deployment with the lowest cost (input + output cost per token, from litellm_params overrides or litellm.model_cost)
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
import dotenv, os, requests, random # type: ignore
|
||||
from typing import Optional, Union, List, Dict
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
|
||||
dotenv.load_dotenv() # Loading env variables using dotenv
|
||||
import traceback
|
||||
from litellm.caching import DualCache
|
||||
from litellm.integrations.custom_logger import CustomLogger
|
||||
from litellm._logging import verbose_router_logger
|
||||
from litellm import ModelResponse
|
||||
from litellm import token_counter
|
||||
import litellm
|
||||
|
||||
|
||||
class LiteLLMBase(BaseModel):
|
||||
"""
|
||||
Implements default functions, all pydantic objects should have.
|
||||
"""
|
||||
|
||||
def json(self, **kwargs):
|
||||
try:
|
||||
return self.model_dump() # noqa
|
||||
except:
|
||||
# if using pydantic v1
|
||||
return self.dict()
|
||||
|
||||
|
||||
class LowestCostLoggingHandler(CustomLogger):
|
||||
test_flag: bool = False
|
||||
logged_success: int = 0
|
||||
logged_failure: int = 0
|
||||
|
||||
def __init__(
|
||||
self, router_cache: DualCache, model_list: list, routing_args: dict = {}
|
||||
):
|
||||
self.router_cache = router_cache
|
||||
self.model_list = model_list
|
||||
|
||||
async def log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
"""
|
||||
Update usage on success
|
||||
"""
|
||||
if kwargs["litellm_params"].get("metadata") is None:
|
||||
pass
|
||||
else:
|
||||
model_group = kwargs["litellm_params"]["metadata"].get(
|
||||
"model_group", None
|
||||
)
|
||||
|
||||
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
|
||||
if model_group is None or id is None:
|
||||
return
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
"""
|
||||
{
|
||||
{model_group}_map: {
|
||||
id: {
|
||||
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
cost_key = f"{model_group}_map"
|
||||
|
||||
response_ms: timedelta = end_time - start_time
|
||||
|
||||
final_value = response_ms
|
||||
total_tokens = 0
|
||||
|
||||
if isinstance(response_obj, ModelResponse):
|
||||
completion_tokens = response_obj.usage.completion_tokens
|
||||
total_tokens = response_obj.usage.total_tokens
|
||||
final_value = float(response_ms.total_seconds() / completion_tokens)
|
||||
|
||||
# ------------
|
||||
# Update usage
|
||||
# ------------
|
||||
|
||||
request_count_dict = (
|
||||
await self.router_cache.async_get_cache(key=cost_key) or {}
|
||||
)
|
||||
|
||||
# check local result first
|
||||
|
||||
if id not in request_count_dict:
|
||||
request_count_dict[id] = {}
|
||||
|
||||
if precise_minute not in request_count_dict[id]:
|
||||
request_count_dict[id][precise_minute] = {}
|
||||
|
||||
if precise_minute not in request_count_dict[id]:
|
||||
request_count_dict[id][precise_minute] = {}
|
||||
|
||||
## TPM
|
||||
request_count_dict[id][precise_minute]["tpm"] = (
|
||||
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
|
||||
)
|
||||
|
||||
## RPM
|
||||
request_count_dict[id][precise_minute]["rpm"] = (
|
||||
request_count_dict[id][precise_minute].get("rpm", 0) + 1
|
||||
)
|
||||
|
||||
await self.router_cache.async_set_cache(
|
||||
key=cost_key, value=request_count_dict
|
||||
)
|
||||
|
||||
### TESTING ###
|
||||
if self.test_flag:
|
||||
self.logged_success += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
|
||||
try:
|
||||
"""
|
||||
Update cost usage on success
|
||||
"""
|
||||
if kwargs["litellm_params"].get("metadata") is None:
|
||||
pass
|
||||
else:
|
||||
model_group = kwargs["litellm_params"]["metadata"].get(
|
||||
"model_group", None
|
||||
)
|
||||
|
||||
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
|
||||
if model_group is None or id is None:
|
||||
return
|
||||
elif isinstance(id, int):
|
||||
id = str(id)
|
||||
|
||||
# ------------
|
||||
# Setup values
|
||||
# ------------
|
||||
"""
|
||||
{
|
||||
{model_group}_map: {
|
||||
id: {
|
||||
"cost": [..]
|
||||
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
cost_key = f"{model_group}_map"
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
|
||||
response_ms: timedelta = end_time - start_time
|
||||
|
||||
final_value = response_ms
|
||||
total_tokens = 0
|
||||
|
||||
if isinstance(response_obj, ModelResponse):
|
||||
completion_tokens = response_obj.usage.completion_tokens
|
||||
total_tokens = response_obj.usage.total_tokens
|
||||
final_value = float(response_ms.total_seconds() / completion_tokens)
|
||||
|
||||
# ------------
|
||||
# Update usage
|
||||
# ------------
|
||||
|
||||
request_count_dict = (
|
||||
await self.router_cache.async_get_cache(key=cost_key) or {}
|
||||
)
|
||||
|
||||
if id not in request_count_dict:
|
||||
request_count_dict[id] = {}
|
||||
if precise_minute not in request_count_dict[id]:
|
||||
request_count_dict[id][precise_minute] = {}
|
||||
|
||||
## TPM
|
||||
request_count_dict[id][precise_minute]["tpm"] = (
|
||||
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
|
||||
)
|
||||
|
||||
## RPM
|
||||
request_count_dict[id][precise_minute]["rpm"] = (
|
||||
request_count_dict[id][precise_minute].get("rpm", 0) + 1
|
||||
)
|
||||
|
||||
await self.router_cache.async_set_cache(
|
||||
key=cost_key, value=request_count_dict
|
||||
) # reset map within window
|
||||
|
||||
### TESTING ###
|
||||
if self.test_flag:
|
||||
self.logged_success += 1
|
||||
except Exception as e:
|
||||
traceback.print_exc()
|
||||
pass
|
||||
|
||||
async def async_get_available_deployments(
|
||||
self,
|
||||
model_group: str,
|
||||
healthy_deployments: list,
|
||||
messages: Optional[List[Dict[str, str]]] = None,
|
||||
input: Optional[Union[str, List]] = None,
|
||||
request_kwargs: Optional[Dict] = None,
|
||||
):
|
||||
"""
|
||||
Returns a deployment with the lowest cost
|
||||
"""
|
||||
cost_key = f"{model_group}_map"
|
||||
|
||||
request_count_dict = await self.router_cache.async_get_cache(key=cost_key) or {}
|
||||
|
||||
# -----------------------
|
||||
# Find lowest used model
|
||||
# ----------------------
|
||||
lowest_cost = float("inf")
|
||||
|
||||
current_date = datetime.now().strftime("%Y-%m-%d")
|
||||
current_hour = datetime.now().strftime("%H")
|
||||
current_minute = datetime.now().strftime("%M")
|
||||
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
|
||||
|
||||
deployment = None
|
||||
|
||||
if request_count_dict is None: # base case
|
||||
return
|
||||
|
||||
all_deployments = request_count_dict
|
||||
for d in healthy_deployments:
|
||||
## if healthy deployment not yet used
|
||||
if d["model_info"]["id"] not in all_deployments:
|
||||
all_deployments[d["model_info"]["id"]] = {
|
||||
precise_minute: {"tpm": 0, "rpm": 0},
|
||||
}
|
||||
|
||||
try:
|
||||
input_tokens = token_counter(messages=messages, text=input)
|
||||
except:
|
||||
input_tokens = 0
|
||||
|
||||
# randomly sample from all_deployments, in case all deployments have latency=0.0
|
||||
_items = all_deployments.items()
|
||||
|
||||
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
|
||||
potential_deployments = []
|
||||
_cost_per_deployment = {}
|
||||
for item, item_map in all_deployments.items():
|
||||
## get the item from model list
|
||||
_deployment = None
|
||||
for m in healthy_deployments:
|
||||
if item == m["model_info"]["id"]:
|
||||
_deployment = m
|
||||
|
||||
if _deployment is None:
|
||||
continue # skip to next one
|
||||
|
||||
_deployment_tpm = (
|
||||
_deployment.get("tpm", None)
|
||||
or _deployment.get("litellm_params", {}).get("tpm", None)
|
||||
or _deployment.get("model_info", {}).get("tpm", None)
|
||||
or float("inf")
|
||||
)
|
||||
|
||||
_deployment_rpm = (
|
||||
_deployment.get("rpm", None)
|
||||
or _deployment.get("litellm_params", {}).get("rpm", None)
|
||||
or _deployment.get("model_info", {}).get("rpm", None)
|
||||
or float("inf")
|
||||
)
|
||||
item_litellm_model_name = _deployment.get("litellm_params", {}).get("model")
|
||||
item_litellm_model_cost_map = litellm.model_cost.get(
|
||||
item_litellm_model_name, {}
|
||||
)
|
||||
|
||||
# check if user provided input_cost_per_token and output_cost_per_token in litellm_params
|
||||
item_input_cost = None
|
||||
item_output_cost = None
|
||||
if _deployment.get("litellm_params", {}).get("input_cost_per_token", None):
|
||||
item_input_cost = _deployment.get("litellm_params", {}).get(
|
||||
"input_cost_per_token"
|
||||
)
|
||||
|
||||
if _deployment.get("litellm_params", {}).get("output_cost_per_token", None):
|
||||
item_output_cost = _deployment.get("litellm_params", {}).get(
|
||||
"output_cost_per_token"
|
||||
)
|
||||
|
||||
if item_input_cost is None:
|
||||
item_input_cost = item_litellm_model_cost_map.get(
|
||||
"input_cost_per_token", 5.0
|
||||
)
|
||||
|
||||
if item_output_cost is None:
|
||||
item_output_cost = item_litellm_model_cost_map.get(
|
||||
"output_cost_per_token", 5.0
|
||||
)
|
||||
|
||||
# if litellm["model"] is not in model_cost map -> use item_cost = $10
|
||||
|
||||
item_cost = item_input_cost + item_output_cost
|
||||
|
||||
item_rpm = item_map.get(precise_minute, {}).get("rpm", 0)
|
||||
item_tpm = item_map.get(precise_minute, {}).get("tpm", 0)
|
||||
|
||||
verbose_router_logger.debug(
|
||||
f"item_cost: {item_cost}, item_tpm: {item_tpm}, item_rpm: {item_rpm}, model_id: {_deployment.get('model_info', {}).get('id')}"
|
||||
)
|
||||
|
||||
# -------------- #
|
||||
# Debugging Logic
|
||||
# -------------- #
|
||||
# We use _cost_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
|
||||
# this helps a user to debug why the router picked a specific deployment #
|
||||
_deployment_api_base = _deployment.get("litellm_params", {}).get(
|
||||
"api_base", ""
|
||||
)
|
||||
if _deployment_api_base is not None:
|
||||
_cost_per_deployment[_deployment_api_base] = item_cost
|
||||
# -------------- #
|
||||
# End of Debugging Logic
|
||||
# -------------- #
|
||||
|
||||
if (
|
||||
item_tpm + input_tokens > _deployment_tpm
|
||||
or item_rpm + 1 > _deployment_rpm
|
||||
): # if user passed in tpm / rpm in the model_list
|
||||
continue
|
||||
else:
|
||||
potential_deployments.append((_deployment, item_cost))
|
||||
|
||||
if len(potential_deployments) == 0:
|
||||
return None
|
||||
|
||||
potential_deployments = sorted(potential_deployments, key=lambda x: x[1])
|
||||
|
||||
selected_deployment = potential_deployments[0][0]
|
||||
return selected_deployment
|
|
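The new handler reads per-deployment `input_cost_per_token` / `output_cost_per_token` overrides from `litellm_params`, falling back to `litellm.model_cost`, and picks the cheapest deployment that is still under its tpm/rpm limits. A sketch of wiring it up through the Router follows; model names and prices are illustrative.

```
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "input_cost_per_token": 0.000001,
                "output_cost_per_token": 0.000002,
            },
        },
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model-2",
                "api_key": "my-fake-key-2",
                "input_cost_per_token": 0.00001,   # 10x pricier, picked only if the cheap one hits its tpm/rpm limits
                "output_cost_per_token": 0.00002,
            },
        },
    ],
    routing_strategy="cost-based-routing",
)
```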
@ -1,7 +1,7 @@
|
|||
#### What this does ####
|
||||
# picks based on response time (for streaming, this is time to first token)
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
import dotenv, os, requests, random
|
||||
from pydantic import BaseModel, Extra, Field, root_validator # type: ignore
|
||||
import dotenv, os, requests, random # type: ignore
|
||||
from typing import Optional, Union, List, Dict
|
||||
from datetime import datetime, timedelta
|
||||
import random
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,5 +1,6 @@
|
|||
import pytest
|
||||
from litellm import acompletion
|
||||
from litellm import completion
|
||||
|
||||
|
||||
def test_acompletion_params():
|
||||
|
@ -7,17 +8,29 @@ def test_acompletion_params():
|
|||
from litellm.types.completion import CompletionRequest
|
||||
|
||||
acompletion_params_odict = inspect.signature(acompletion).parameters
|
||||
acompletion_params = {name: param.annotation for name, param in acompletion_params_odict.items()}
|
||||
completion_params = {field_name: field_type for field_name, field_type in CompletionRequest.__annotations__.items()}
|
||||
completion_params_dict = inspect.signature(completion).parameters
|
||||
|
||||
# remove kwargs
|
||||
acompletion_params.pop("kwargs", None)
|
||||
acompletion_params = {
|
||||
name: param.annotation for name, param in acompletion_params_odict.items()
|
||||
}
|
||||
completion_params = {
|
||||
name: param.annotation for name, param in completion_params_dict.items()
|
||||
}
|
||||
|
||||
keys_acompletion = set(acompletion_params.keys())
|
||||
keys_completion = set(completion_params.keys())
|
||||
|
||||
print(keys_acompletion)
|
||||
print("\n\n\n")
|
||||
print(keys_completion)
|
||||
|
||||
print("diff=", keys_completion - keys_acompletion)
|
||||
|
||||
# Assert that the parameters are the same
|
||||
if keys_acompletion != keys_completion:
|
||||
pytest.fail("The parameters of the acompletion function and the CompletionRequest class are not the same.")
|
||||
pytest.fail(
|
||||
"The parameters of the litellm.acompletion function and litellm.completion are not the same."
|
||||
)
|
||||
|
||||
|
||||
# test_acompletion_params()
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import copy
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import io, asyncio
|
||||
import asyncio
|
||||
|
||||
import logging
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
sys.path.insert(0, os.path.abspath("../.."))
|
||||
|
@ -18,6 +20,21 @@ import time
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def langfuse_client():
|
||||
import langfuse
|
||||
|
||||
langfuse_client = langfuse.Langfuse(
|
||||
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
|
||||
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
|
||||
)
|
||||
|
||||
with patch(
|
||||
"langfuse.Langfuse", MagicMock(return_value=langfuse_client)
|
||||
) as mock_langfuse_client:
|
||||
yield mock_langfuse_client()
|
||||
|
||||
|
||||
def search_logs(log_file_path, num_good_logs=1):
|
||||
"""
|
||||
Searches the given log file for logs containing the "/api/public" string.
|
||||
|
@ -129,21 +146,10 @@ def test_langfuse_logging_async():
|
|||
pytest.fail(f"An exception occurred - {e}")
|
||||
|
||||
|
||||
async def make_async_calls():
|
||||
async def make_async_calls(metadata=None, **completion_kwargs):
|
||||
tasks = []
|
||||
for _ in range(5):
|
||||
task = asyncio.create_task(
|
||||
litellm.acompletion(
|
||||
model="azure/chatgpt-v-2",
|
||||
messages=[{"role": "user", "content": "This is a test"}],
|
||||
max_tokens=5,
|
||||
temperature=0.7,
|
||||
timeout=5,
|
||||
user="langfuse_latency_test_user",
|
||||
mock_response="It's simple to use and easy to get started",
|
||||
)
|
||||
)
|
||||
tasks.append(task)
|
||||
tasks.append(create_async_task())
|
||||
|
||||
# Measure the start time before running the tasks
|
||||
start_time = asyncio.get_event_loop().time()
|
||||
|
@ -161,9 +167,30 @@ async def make_async_calls():
|
|||
return total_time
|
||||
|
||||
|
||||
def create_async_task(**completion_kwargs):
|
||||
"""
|
||||
Creates an async task for the litellm.acompletion function.
|
||||
This is just the task, but it is not run here.
|
||||
To run the task it must be awaited or used in other asyncio coroutine execution functions like asyncio.gather.
|
||||
Any kwargs passed to this function will be passed to the litellm.acompletion function.
|
||||
By default a standard set of arguments are used for the litellm.acompletion function.
|
||||
"""
|
||||
completion_args = {
|
||||
"model": "azure/chatgpt-v-2",
|
||||
"messages": [{"role": "user", "content": "This is a test"}],
|
||||
"max_tokens": 5,
|
||||
"temperature": 0.7,
|
||||
"timeout": 5,
|
||||
"user": "langfuse_latency_test_user",
|
||||
"mock_response": "It's simple to use and easy to get started",
|
||||
}
|
||||
completion_args.update(completion_kwargs)
|
||||
return asyncio.create_task(litellm.acompletion(**completion_args))
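As a quick illustration (reusing the create_async_task helper and the asyncio import already in this test module), per-call overrides can be passed straight through to litellm.acompletion and the tasks gathered, which is what make_async_calls() now does; the trace ids here are placeholders.

```
async def run_mocked_batch():
    tasks = [
        create_async_task(metadata={"trace_id": f"litellm-test-trace-{i}"})
        for i in range(5)
    ]
    # run all mocked completions concurrently and return their responses
    return await asyncio.gather(*tasks)
```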
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("stream", [False, True])
|
||||
async def test_langfuse_logging_without_request_response(stream):
|
||||
async def test_langfuse_logging_without_request_response(stream, langfuse_client):
|
||||
try:
|
||||
import uuid
|
||||
|
||||
|
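The docstring above spells out the intended usage: create_async_task only builds the task, and the caller decides how to execute it. A minimal sketch of that pattern, mirroring the five-call batch in make_async_calls (run_batch is an illustrative wrapper, not part of the test file):

import asyncio


async def run_batch():
    # Build the tasks first (each wraps litellm.acompletion via create_async_task),
    # then run them concurrently and collect the responses.
    tasks = [create_async_task(metadata={"batch_index": i}) for i in range(5)]
    return await asyncio.gather(*tasks)
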
@@ -171,12 +198,8 @@ async def test_langfuse_logging_without_request_response(stream):
        litellm.set_verbose = True
        litellm.turn_off_message_logging = True
        litellm.success_callback = ["langfuse"]
        response = await litellm.acompletion(
        response = await create_async_task(
            model="gpt-3.5-turbo",
            mock_response="It's simple to use and easy to get started",
            messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
            max_tokens=10,
            temperature=0.2,
            stream=stream,
            metadata={"trace_id": _unique_trace_name},
        )

@@ -185,14 +208,8 @@ async def test_langfuse_logging_without_request_response(stream):
            async for chunk in response:
                print(chunk)

        await asyncio.sleep(3)

        import langfuse

        langfuse_client = langfuse.Langfuse(
            public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
            secret_key=os.environ["LANGFUSE_SECRET_KEY"],
        )
        langfuse_client.flush()
        await asyncio.sleep(2)

        # get trace with _unique_trace_name
        trace = langfuse_client.get_generations(trace_id=_unique_trace_name)

@@ -211,6 +228,123 @@ async def test_langfuse_logging_without_request_response(stream):
        pytest.fail(f"An exception occurred - {e}")


@pytest.mark.asyncio
async def test_langfuse_logging_metadata(langfuse_client):
    """
    Test that creates multiple traces, with a varying number of generations and sets various metadata fields
    Confirms that no metadata that is standard within Langfuse is duplicated in the respective trace or generation metadata
    For trace continuation certain metadata of the trace is overriden with metadata from the last generation based on the update_trace_keys field
    Version is set for both the trace and the generation
    Release is just set for the trace
    Tags is just set for the trace
    """
    import uuid

    litellm.set_verbose = True
    litellm.success_callback = ["langfuse"]

    trace_identifiers = {}
    expected_filtered_metadata_keys = {
        "trace_name",
        "trace_id",
        "existing_trace_id",
        "trace_user_id",
        "session_id",
        "tags",
        "generation_name",
        "generation_id",
        "prompt",
    }
    trace_metadata = {
        "trace_actual_metadata_key": "trace_actual_metadata_value"
    }  # Allows for setting the metadata on the trace
    run_id = str(uuid.uuid4())
    session_id = f"litellm-test-session-{run_id}"
    trace_common_metadata = {
        "session_id": session_id,
        "tags": ["litellm-test-tag1", "litellm-test-tag2"],
        "update_trace_keys": [
            "output",
            "trace_metadata",
        ],  # Overwrite the following fields in the trace with the last generation's output and the trace_user_id
        "trace_metadata": trace_metadata,
        "gen_metadata_key": "gen_metadata_value",  # Metadata key that should not be filtered in the generation
        "trace_release": "litellm-test-release",
        "version": "litellm-test-version",
    }
    for trace_num in range(1, 3):  # Two traces
        metadata = copy.deepcopy(trace_common_metadata)
        trace_id = f"litellm-test-trace{trace_num}-{run_id}"
        metadata["trace_id"] = trace_id
        metadata["trace_name"] = trace_id
        trace_identifiers[trace_id] = []
        print(f"Trace: {trace_id}")
        for generation_num in range(
            1, trace_num + 1
        ):  # Each trace has a number of generations equal to its trace number
            metadata["trace_user_id"] = f"litellm-test-user{generation_num}-{run_id}"
            generation_id = (
                f"litellm-test-trace{trace_num}-generation-{generation_num}-{run_id}"
            )
            metadata["generation_id"] = generation_id
            metadata["generation_name"] = generation_id
            metadata["trace_metadata"][
                "generation_id"
            ] = generation_id  # Update to test if trace_metadata is overwritten by update trace keys
            trace_identifiers[trace_id].append(generation_id)
            print(f"Generation: {generation_id}")
            response = await create_async_task(
                model="gpt-3.5-turbo",
                mock_response=f"{session_id}:{trace_id}:{generation_id}",
                messages=[
                    {
                        "role": "user",
                        "content": f"{session_id}:{trace_id}:{generation_id}",
                    }
                ],
                max_tokens=100,
                temperature=0.2,
                metadata=copy.deepcopy(
                    metadata
                ),  # Every generation needs its own metadata, langfuse is not async/thread safe without it
            )
            print(response)
            metadata["existing_trace_id"] = trace_id

    langfuse_client.flush()
    await asyncio.sleep(2)

    # Tests the metadata filtering and the override of the output to be the last generation
    for trace_id, generation_ids in trace_identifiers.items():
        trace = langfuse_client.get_trace(id=trace_id)
        assert trace.id == trace_id
        assert trace.session_id == session_id
        assert trace.metadata != trace_metadata
        generations = list(
            reversed(langfuse_client.get_generations(trace_id=trace_id).data)
        )
        assert len(generations) == len(generation_ids)
        assert (
            trace.input == generations[0].input
        )  # Should be set by the first generation
        assert (
            trace.output == generations[-1].output
        )  # Should be overwritten by the last generation according to update_trace_keys
        assert (
            trace.metadata != generations[-1].metadata
        )  # Should be overwritten by the last generation according to update_trace_keys
        assert trace.metadata["generation_id"] == generations[-1].id
        assert set(trace.tags).issuperset(trace_common_metadata["tags"])
        print("trace_from_langfuse", trace)
        for generation_id, generation in zip(generation_ids, generations):
            assert generation.id == generation_id
            assert generation.trace_id == trace_id
            assert set(generation.metadata.keys()).isdisjoint(
                expected_filtered_metadata_keys
            )
            print("generation_from_langfuse", generation)


@pytest.mark.skip(reason="beta test - checking langfuse output")
def test_langfuse_logging():
    try:

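The assertions above depend on the Langfuse callback stripping its own control keys out of the metadata it forwards. A minimal sketch of that filtering idea, assuming it amounts to a dict split over a reserved-key set (split_metadata is illustrative, not the callback's real function; the key set mirrors expected_filtered_metadata_keys):

RESERVED_KEYS = {
    "trace_name",
    "trace_id",
    "existing_trace_id",
    "trace_user_id",
    "session_id",
    "tags",
    "generation_name",
    "generation_id",
    "prompt",
}


def split_metadata(metadata: dict):
    # Control keys steer the Langfuse trace/generation; everything else is
    # passed through as user metadata (e.g. "gen_metadata_key" in the test).
    controls = {k: v for k, v in metadata.items() if k in RESERVED_KEYS}
    passthrough = {k: v for k, v in metadata.items() if k not in RESERVED_KEYS}
    return controls, passthrough
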
@@ -570,6 +704,10 @@ def test_langfuse_existing_trace_id():
    assert initial_langfuse_trace_dict == new_langfuse_trace_dict


@pytest.mark.skipif(
    condition=not os.environ.get("OPENAI_API_KEY", False),
    reason="Authentication missing for openai",
)
def test_langfuse_logging_tool_calling():
    litellm.set_verbose = True


@@ -1,7 +1,7 @@
# What is this?
## Tests slack alerting on proxy logging object

import sys
import sys, json
import os
import io, asyncio
from datetime import datetime, timedelta

@@ -10,14 +10,18 @@ from datetime import datetime, timedelta
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
from litellm.proxy.utils import ProxyLogging
from litellm.caching import DualCache
from litellm.caching import DualCache, RedisCache
import litellm
import pytest
import asyncio
from unittest.mock import patch, MagicMock
from litellm.utils import get_api_base
from litellm.caching import DualCache
from litellm.integrations.slack_alerting import SlackAlerting
from litellm.integrations.slack_alerting import SlackAlerting, DeploymentMetrics
import unittest.mock
from unittest.mock import AsyncMock
import pytest
from litellm.router import AlertingConfig, Router


@pytest.mark.parametrize(

@@ -61,7 +65,7 @@ async def test_get_api_base():
    end_time = datetime.now()

    time_difference_float, model, api_base, messages = (
        _pl.slack_alerting_instance._response_taking_too_long_callback(
        _pl.slack_alerting_instance._response_taking_too_long_callback_helper(
            kwargs={
                "model": model,
                "messages": messages,

@@ -98,7 +102,10 @@ def mock_env(monkeypatch):
# Test the __init__ method
def test_init():
    slack_alerting = SlackAlerting(
        alerting_threshold=32, alerting=["slack"], alert_types=["llm_exceptions"]
        alerting_threshold=32,
        alerting=["slack"],
        alert_types=["llm_exceptions"],
        internal_usage_cache=DualCache(),
    )
    assert slack_alerting.alerting_threshold == 32
    assert slack_alerting.alerting == ["slack"]

@@ -116,7 +123,7 @@ from datetime import datetime, timedelta

@pytest.fixture
def slack_alerting():
    return SlackAlerting(alerting_threshold=1)
    return SlackAlerting(alerting_threshold=1, internal_usage_cache=DualCache())


# Test for hanging LLM responses

@@ -185,3 +192,170 @@ async def test_send_alert(slack_alerting):
        mock_post.return_value.status_code = 200
        await slack_alerting.send_alert("Test message", "Low", "budget_alerts")
        mock_post.assert_awaited_once()


@pytest.mark.asyncio
async def test_daily_reports_unit_test(slack_alerting):
    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "test-gpt",
                    "litellm_params": {"model": "gpt-3.5-turbo"},
                    "model_info": {"id": "1234"},
                }
            ]
        )
        deployment_metrics = DeploymentMetrics(
            id="1234",
            failed_request=False,
            latency_per_output_token=20.3,
            updated_at=litellm.utils.get_utc_datetime(),
        )

        updated_val = await slack_alerting.async_update_daily_reports(
            deployment_metrics=deployment_metrics
        )

        assert updated_val == 1

        await slack_alerting.send_daily_reports(router=router)

        mock_send_alert.assert_awaited_once()


@pytest.mark.asyncio
async def test_daily_reports_completion(slack_alerting):
    with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
        litellm.callbacks = [slack_alerting]

        # on async success
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt-5",
                    "litellm_params": {
                        "model": "gpt-3.5-turbo",
                    },
                }
            ]
        )

        await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )

        await asyncio.sleep(3)
        response_val = await slack_alerting.send_daily_reports(router=router)

        assert response_val == True

        mock_send_alert.assert_awaited_once()

        # on async failure
        router = litellm.Router(
            model_list=[
                {
                    "model_name": "gpt-5",
                    "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad_key"},
                }
            ]
        )

        try:
            await router.acompletion(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": "Hey, how's it going?"}],
            )
        except Exception as e:
            pass

        await asyncio.sleep(3)
        response_val = await slack_alerting.send_daily_reports(router=router)

        assert response_val == True

        mock_send_alert.assert_awaited()


@pytest.mark.asyncio
async def test_daily_reports_redis_cache_scheduler():
    redis_cache = RedisCache()
    slack_alerting = SlackAlerting(
        internal_usage_cache=DualCache(redis_cache=redis_cache)
    )
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-5",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            }
        ]
    )

    with patch.object(
        slack_alerting, "send_alert", new=AsyncMock()
    ) as mock_send_alert, patch.object(
        redis_cache, "async_set_cache", new=AsyncMock()
    ) as mock_redis_set_cache:
        # initial call - expect empty
        await slack_alerting._run_scheduler_helper(llm_router=router)

        try:
            json.dumps(mock_redis_set_cache.call_args[0][1])
        except Exception as e:
            pytest.fail(
                "Cache value can't be json dumped - {}".format(
                    mock_redis_set_cache.call_args[0][1]
                )
            )

        mock_redis_set_cache.assert_awaited_once()

        # second call - expect empty
        await slack_alerting._run_scheduler_helper(llm_router=router)


@pytest.mark.asyncio
@pytest.mark.skip(reason="Local test. Test if slack alerts are sent.")
async def test_send_llm_exception_to_slack():
    from litellm.router import AlertingConfig

    # on async success
    router = litellm.Router(
        model_list=[
            {
                "model_name": "gpt-3.5-turbo",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                    "api_key": "bad_key",
                },
            },
            {
                "model_name": "gpt-5-good",
                "litellm_params": {
                    "model": "gpt-3.5-turbo",
                },
            },
        ],
        alerting_config=AlertingConfig(
            alerting_threshold=0.5, webhook_url=os.getenv("SLACK_WEBHOOK_URL")
        ),
    )
    try:
        await router.acompletion(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Hey, how's it going?"}],
        )
    except:
        pass

    await router.acompletion(
        model="gpt-5-good",
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )

    await asyncio.sleep(3)

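These alerting tests repeat one pattern: swap an async method for unittest.mock.AsyncMock with patch.object, exercise the code, then assert the mock was awaited. A stripped-down sketch of that pattern on a hypothetical Alerter class (illustrative names, not LiteLLM APIs):

import asyncio
from unittest.mock import AsyncMock, patch


class Alerter:
    async def send_alert(self, message: str) -> None:
        ...  # a real implementation would post to a webhook


async def main():
    alerter = Alerter()
    with patch.object(alerter, "send_alert", new=AsyncMock()) as mock_send:
        # The patched attribute is the AsyncMock, so awaiting it records the call.
        await alerter.send_alert("hello")
        mock_send.assert_awaited_once_with("hello")


asyncio.run(main())
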
@@ -118,6 +118,7 @@ def test_completion_claude():

def test_completion_claude_3_empty_response():
    litellm.set_verbose = True

    messages = [
        {
            "role": "system",

@@ -2167,9 +2168,9 @@ def test_completion_replicate_vicuna():

def test_replicate_custom_prompt_dict():
    litellm.set_verbose = True
    model_name = "replicate/meta/llama-2-70b-chat"
    model_name = "replicate/meta/llama-2-7b"
    litellm.register_prompt_template(
        model="replicate/meta/llama-2-70b-chat",
        model="replicate/meta/llama-2-7b",
        initial_prompt_value="You are a good assistant",  # [OPTIONAL]
        roles={
            "system": {

@@ -2199,6 +2200,7 @@ def test_replicate_custom_prompt_dict():
            repetition_penalty=0.1,
            num_retries=3,
        )

    except litellm.APIError as e:
        pass
    except litellm.APIConnectionError as e:

@@ -3016,6 +3018,21 @@ async def test_acompletion_gemini():
        pytest.fail(f"Error occurred: {e}")


# Deepseek tests
def test_completion_deepseek():
    litellm.set_verbose = True
    model_name = "deepseek/deepseek-chat"
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    try:
        response = completion(model=model_name, messages=messages)
        # Add any assertions here to check the response
        print(response)
    except litellm.APIError as e:
        pass
    except Exception as e:
        pytest.fail(f"Error occurred: {e}")


# Palm tests
def test_completion_palm():
    litellm.set_verbose = True

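The replicate test above uses litellm.register_prompt_template to control how chat messages are flattened into a single prompt for the registered model. A hedged sketch of a fuller call; the pre_message/post_message strings are illustrative values, not the ones from the test:

import litellm
from litellm import completion

litellm.register_prompt_template(
    model="replicate/meta/llama-2-7b",
    initial_prompt_value="You are a good assistant",
    roles={
        # How each role's content is wrapped when building the flat prompt.
        "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]\n"},
        "assistant": {"pre_message": "\n", "post_message": "\n"},
    },
    final_prompt_value="Now answer as best you can:",
)

response = completion(
    model="replicate/meta/llama-2-7b",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
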
@@ -231,14 +231,17 @@ def test_cost_bedrock_pricing():
    assert cost == predicted_cost


@pytest.mark.skip(reason="AWS disabled our access")
def test_cost_bedrock_pricing_actual_calls():
    litellm.set_verbose = True
    model = "anthropic.claude-instant-v1"
    messages = [{"role": "user", "content": "Hey, how's it going?"}]
    response = litellm.completion(model=model, messages=messages)
    assert response._hidden_params["region_name"] is not None
    response = litellm.completion(
        model=model, messages=messages, mock_response="hello cool one"
    )

    print("response", response)
    cost = litellm.completion_cost(
        model="bedrock/anthropic.claude-instant-v1",
        completion_response=response,
        messages=[{"role": "user", "content": "Hey, how's it going?"}],
    )

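For reference, a short sketch of the pattern the updated bedrock test uses: mock_response keeps the call offline, and litellm.completion_cost prices the mocked response against the named model (the exact cost depends on litellm's bundled price map):

import litellm

response = litellm.completion(
    model="bedrock/anthropic.claude-instant-v1",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    mock_response="hello cool one",  # no AWS call is made
)

cost = litellm.completion_cost(
    completion_response=response,
    model="bedrock/anthropic.claude-instant-v1",
)
print("cost:", cost)
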
@@ -140,6 +140,8 @@ async def test_add_existing_deployment():
            deployment_2.to_json(exclude_none=True),
        ]
    )

    init_len_list = len(llm_router.model_list)
    print(f"llm_router: {llm_router}")
    master_key = "sk-1234"
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)

@@ -164,7 +166,7 @@ async def test_add_existing_deployment():
    db_models = [db_model]
    num_added = pc._add_deployment(db_models=db_models)

    assert num_added == 0
    assert init_len_list == len(llm_router.model_list)


    litellm_params = LiteLLM_Params(

Some files were not shown because too many files have changed in this diff.