added changes from upstream

Merge branch 'main' into fix/error-on-get-user-role
Nick Wong 2024-05-09 16:14:14 -07:00
commit d3a228d03b
No known key found for this signature in database
GPG key ID: F97B88DE019A52E9
142 changed files with 4439 additions and 801 deletions

View file

@@ -188,7 +188,7 @@ jobs:
          command: |
            docker run -d \
            -p 4000:4000 \
-           -e DATABASE_URL=$PROXY_DOCKER_DB_URL \
+           -e DATABASE_URL=$PROXY_DATABASE_URL \
            -e AZURE_API_KEY=$AZURE_API_KEY \
            -e REDIS_HOST=$REDIS_HOST \
            -e REDIS_PASSWORD=$REDIS_PASSWORD \
@@ -223,7 +223,7 @@ jobs:
          background: true
      - run:
          name: Wait for app to be ready
-         command: dockerize -wait http://localhost:4000 -timeout 1m
+         command: dockerize -wait http://localhost:4000 -timeout 5m
      - run:
          name: Run tests
          command: |

View file

@@ -0,0 +1,51 @@
{
"name": "Python 3.11",
// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
"image": "mcr.microsoft.com/devcontainers/python:3.11-bookworm",
// https://github.com/devcontainers/images/tree/main/src/python
// https://mcr.microsoft.com/en-us/product/devcontainers/python/tags
// "build": {
// "dockerfile": "Dockerfile",
// "context": ".."
// },
// Features to add to the dev container. More info: https://containers.dev/features.
// "features": {},
// Configure tool-specific properties.
"customizations": {
// Configure properties specific to VS Code.
"vscode": {
"settings": {},
"extensions": [
"ms-python.python",
"ms-python.vscode-pylance",
"GitHub.copilot",
"GitHub.copilot-chat"
]
}
},
// Use 'forwardPorts' to make a list of ports inside the container available locally.
"forwardPorts": [4000],
"containerEnv": {
"LITELLM_LOG": "DEBUG"
},
// Use 'portsAttributes' to set default properties for specific forwarded ports.
// More info: https://containers.dev/implementors/json_reference/#port-attributes
"portsAttributes": {
"4000": {
"label": "LiteLLM Server",
"onAutoForward": "notify"
}
},
// More info: https://aka.ms/dev-containers-non-root.
// "remoteUser": "litellm",
// Use 'postCreateCommand' to run commands after the container is created.
"postCreateCommand": "pipx install poetry && poetry install -E extra_proxy -E proxy"
}

View file

@@ -64,6 +64,11 @@ if __name__ == "__main__":
    )  # Replace with your repository's username and name
    latest_release = repo.get_latest_release()
    print("got latest release: ", latest_release)
+   print(latest_release.title)
+   print(latest_release.tag_name)
+   release_version = latest_release.title
    print("latest release body: ", latest_release.body)
    print("markdown table: ", markdown_table)
@@ -74,8 +79,22 @@ if __name__ == "__main__":
    start_index = latest_release.body.find("Load Test LiteLLM Proxy Results")
    existing_release_body = latest_release.body[:start_index]
+   docker_run_command = f"""
+\n\n
+## Docker Run LiteLLM Proxy
+```
+docker run \\
+-e STORE_MODEL_IN_DB=True \\
+-p 4000:4000 \\
+ghcr.io/berriai/litellm:main-{release_version}
+```
+"""
+   print("docker run command: ", docker_run_command)
    new_release_body = (
        existing_release_body
+       + docker_run_command
        + "\n\n"
        + "### Don't want to maintain your internal proxy? get in touch 🎉"
        + "\nHosted Proxy Alpha: https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat"

.gitignore vendored (4 changes)
View file

@@ -1,5 +1,6 @@
.venv
.env
+litellm/proxy/myenv/*
litellm_uuid.txt
__pycache__/
*.pyc
@@ -52,3 +53,6 @@ litellm/proxy/_new_secret_config.yaml
litellm/proxy/_new_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
litellm/proxy/_super_secret_config.yaml
+litellm/proxy/myenv/bin/activate
+litellm/proxy/myenv/bin/Activate.ps1
+myenv/*

View file

@@ -16,11 +16,11 @@ repos:
        name: Check if files match
        entry: python3 ci_cd/check_files_match.py
        language: system
-  - repo: local
-    hooks:
-      - id: mypy
-        name: mypy
-        entry: python3 -m mypy --ignore-missing-imports
-        language: system
-        types: [python]
-        files: ^litellm/
+  # - repo: local
+  #   hooks:
+  #     - id: mypy
+  #       name: mypy
+  #       entry: python3 -m mypy --ignore-missing-imports
+  #       language: system
+  #       types: [python]
+  #       files: ^litellm/

View file

@@ -226,6 +226,7 @@ curl 'http://0.0.0.0:4000/key/generate' \
| [deepinfra](https://docs.litellm.ai/docs/providers/deepinfra) | ✅ | ✅ | ✅ | ✅ |
| [perplexity-ai](https://docs.litellm.ai/docs/providers/perplexity) | ✅ | ✅ | ✅ | ✅ |
| [Groq AI](https://docs.litellm.ai/docs/providers/groq) | ✅ | ✅ | ✅ | ✅ |
+| [Deepseek](https://docs.litellm.ai/docs/providers/deepseek) | ✅ | ✅ | ✅ | ✅ |
| [anyscale](https://docs.litellm.ai/docs/providers/anyscale) | ✅ | ✅ | ✅ | ✅ |
| [IBM - watsonx.ai](https://docs.litellm.ai/docs/providers/watsonx) | ✅ | ✅ | ✅ | ✅ | ✅ |
| [voyage ai](https://docs.litellm.ai/docs/providers/voyage) | | | | | ✅ |

Binary file not shown.

View file

@@ -0,0 +1,15 @@
{
"$schema": "https://schema.management.azure.com/schemas/0.1.2-preview/CreateUIDefinition.MultiVm.json#",
"handler": "Microsoft.Azure.CreateUIDef",
"version": "0.1.2-preview",
"parameters": {
"config": {
"isWizard": false,
"basics": { }
},
"basics": [ ],
"steps": [ ],
"outputs": { },
"resourceTypes": [ ]
}
}

View file

@@ -0,0 +1,63 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"imageName": {
"type": "string",
"defaultValue": "ghcr.io/berriai/litellm:main-latest"
},
"containerName": {
"type": "string",
"defaultValue": "litellm-container"
},
"dnsLabelName": {
"type": "string",
"defaultValue": "litellm"
},
"portNumber": {
"type": "int",
"defaultValue": 4000
}
},
"resources": [
{
"type": "Microsoft.ContainerInstance/containerGroups",
"apiVersion": "2021-03-01",
"name": "[parameters('containerName')]",
"location": "[resourceGroup().location]",
"properties": {
"containers": [
{
"name": "[parameters('containerName')]",
"properties": {
"image": "[parameters('imageName')]",
"resources": {
"requests": {
"cpu": 1,
"memoryInGB": 2
}
},
"ports": [
{
"port": "[parameters('portNumber')]"
}
]
}
}
],
"osType": "Linux",
"restartPolicy": "Always",
"ipAddress": {
"type": "Public",
"ports": [
{
"protocol": "tcp",
"port": "[parameters('portNumber')]"
}
],
"dnsNameLabel": "[parameters('dnsLabelName')]"
}
}
}
]
}

View file

@@ -0,0 +1,42 @@
param imageName string = 'ghcr.io/berriai/litellm:main-latest'
param containerName string = 'litellm-container'
param dnsLabelName string = 'litellm'
param portNumber int = 4000
resource containerGroupName 'Microsoft.ContainerInstance/containerGroups@2021-03-01' = {
name: containerName
location: resourceGroup().location
properties: {
containers: [
{
name: containerName
properties: {
image: imageName
resources: {
requests: {
cpu: 1
memoryInGB: 2
}
}
ports: [
{
port: portNumber
}
]
}
}
]
osType: 'Linux'
restartPolicy: 'Always'
ipAddress: {
type: 'Public'
ports: [
{
protocol: 'tcp'
port: portNumber
}
]
dnsNameLabel: dnsLabelName
}
}
}
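For context, templates like the ARM JSON above and this Bicep equivalent are typically deployed with the Azure CLI. The sketch below is illustrative and not part of the commit; the resource-group name, region, and file names are placeholders.

```bash
# Hypothetical deployment of the container-group template with the Azure CLI.
az group create --name litellm-rg --location eastus

# Deploy the Bicep file (the ARM JSON variant works the same way via --template-file <file>.json).
az deployment group create \
  --resource-group litellm-rg \
  --template-file containerapp.bicep \
  --parameters dnsLabelName=my-litellm portNumber=4000
```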

View file

@@ -83,6 +83,7 @@ def completion(
    top_p: Optional[float] = None,
    n: Optional[int] = None,
    stream: Optional[bool] = None,
+   stream_options: Optional[dict] = None,
    stop=None,
    max_tokens: Optional[int] = None,
    presence_penalty: Optional[float] = None,
@@ -139,6 +140,10 @@ def completion(
- `stream`: *boolean or null (optional)* - If set to true, it sends partial message deltas. Tokens will be sent as they become available, with the stream terminated by a [DONE] message.
+- `stream_options` *dict or null (optional)* - Options for streaming response. Only set this when you set `stream: true`
+  - `include_usage` *boolean (optional)* - If set, an additional chunk will be streamed before the data: [DONE] message. The usage field on this chunk shows the token usage statistics for the entire request, and the choices field will always be an empty array. All other chunks will also include a usage field, but with a null value.
- `stop`: *string/ array/ null (optional)* - Up to 4 sequences where the API will stop generating further tokens.
- `max_tokens`: *integer (optional)* - The maximum number of tokens to generate in the chat completion.
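As a companion to the documented `stream_options` parameter, here is a minimal sketch of how the `include_usage` option might be exercised with `litellm.completion`. It is not taken from this commit; the model name and API key are placeholders.

```python
import os
import litellm

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key

# Stream a completion and request a final usage chunk, per the docs above.
# With include_usage=True the last chunk before [DONE] carries token usage
# for the entire request; earlier chunks carry message deltas.
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hello from litellm"}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in response:
    print(chunk)
```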

View file

@@ -47,3 +47,12 @@ Pricing is based on usage. We can figure out a price that works for your team, o
<Image img={require('../img/litellm_hosted_ui_router.png')} />

#### [**🚨 Schedule Call**](https://calendly.com/d/4mp-gd3-k5k/litellm-1-1-onboarding-chat)
## Feature List
- Easy way to add/remove models
- 100% uptime even when models are added/removed
- custom callback webhooks
- your domain name with HTTPS
- Ability to create/delete User API keys
- Reasonable set monthly cost

View file

@@ -14,14 +14,14 @@ import TabItem from '@theme/TabItem';
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['OPENAI_API_KEY'] = ""
chat = ChatLiteLLM(model="gpt-3.5-turbo")
@@ -30,7 +30,7 @@ messages = [
        content="what model are you"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
@@ -39,14 +39,14 @@ chat(messages)
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['ANTHROPIC_API_KEY'] = ""
chat = ChatLiteLLM(model="claude-2", temperature=0.3)
@@ -55,7 +55,7 @@ messages = [
        content="what model are you"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
@@ -64,14 +64,14 @@ chat(messages)
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['REPLICATE_API_TOKEN'] = ""
chat = ChatLiteLLM(model="replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1")
@@ -80,7 +80,7 @@ messages = [
        content="what model are you?"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
@@ -89,14 +89,14 @@ chat(messages)
```python
import os
-from langchain.chat_models import ChatLiteLLM
+from langchain_community.chat_models import ChatLiteLLM
-from langchain.prompts.chat import (
+from langchain_core.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
-from langchain.schema import AIMessage, HumanMessage, SystemMessage
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

os.environ['COHERE_API_KEY'] = ""
chat = ChatLiteLLM(model="command-nightly")
@@ -105,32 +105,9 @@ messages = [
        content="what model are you?"
    )
]
-chat(messages)
+chat.invoke(messages)
```
</TabItem>
<TabItem value="palm" label="PaLM - Google">
```python
import os
from langchain.chat_models import ChatLiteLLM
from langchain.prompts.chat import (
ChatPromptTemplate,
SystemMessagePromptTemplate,
AIMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
os.environ['PALM_API_KEY'] = ""
chat = ChatLiteLLM(model="palm/chat-bison")
messages = [
HumanMessage(
content="what model are you?"
)
]
chat(messages)
```
</TabItem>
</Tabs>

View file

@@ -94,9 +94,10 @@ print(response)
```
-### Set Custom Trace ID, Trace User ID and Tags
-Pass `trace_id`, `trace_user_id` in `metadata`
+### Set Custom Trace ID, Trace User ID, Trace Metadata, Trace Version, Trace Release and Tags
+Pass `trace_id`, `trace_user_id`, `trace_metadata`, `trace_version`, `trace_release`, `tags` in `metadata`

```python
import litellm
@@ -121,12 +122,20 @@ response = completion(
    metadata={
        "generation_name": "ishaan-test-generation", # set langfuse Generation Name
        "generation_id": "gen-id22", # set langfuse Generation ID
+       "version": "test-generation-version" # set langfuse Generation Version
        "trace_user_id": "user-id2", # set langfuse Trace User ID
        "session_id": "session-1", # set langfuse Session ID
-       "tags": ["tag1", "tag2"] # set langfuse Tags
+       "tags": ["tag1", "tag2"], # set langfuse Tags
        "trace_id": "trace-id22", # set langfuse Trace ID
+       "trace_metadata": {"key": "value"}, # set langfuse Trace Metadata
+       "trace_version": "test-trace-version", # set langfuse Trace Version (if not set, defaults to Generation Version)
+       "trace_release": "test-trace-release", # set langfuse Trace Release
        ### OR ###
        "existing_trace_id": "trace-id22", # if generation is continuation of past trace. This prevents default behaviour of setting a trace name
+       ### OR enforce that certain fields are trace overwritten in the trace during the continuation ###
+       "existing_trace_id": "trace-id22",
+       "trace_metadata": {"key": "updated_trace_value"}, # The new value to use for the langfuse Trace Metadata
+       "update_trace_keys": ["input", "output", "trace_metadata"], # Updates the trace input & output to be this generations input & output also updates the Trace Metadata to match the passed in value
    },
)
@@ -134,6 +143,38 @@ print(response)
```
### Trace & Generation Parameters
#### Trace Specific Parameters
* `trace_id` - Identifier for the trace, must use `existing_trace_id` instead or in conjunction with `trace_id` if this is an existing trace, auto-generated by default
* `trace_name` - Name of the trace, auto-generated by default
* `session_id` - Session identifier for the trace, defaults to `None`
* `trace_version` - Version for the trace, defaults to value for `version`
* `trace_release` - Release for the trace, defaults to `None`
* `trace_metadata` - Metadata for the trace, defaults to `None`
* `trace_user_id` - User identifier for the trace, defaults to completion argument `user`
* `tags` - Tags for the trace, defeaults to `None`
##### Updatable Parameters on Continuation
The following parameters can be updated on a continuation of a trace by passing in the following values into the `update_trace_keys` in the metadata of the completion.
* `input` - Will set the traces input to be the input of this latest generation
* `output` - Will set the traces output to be the output of this generation
* `trace_version` - Will set the trace version to be the provided value (To use the latest generations version instead, use `version`)
* `trace_release` - Will set the trace release to be the provided value
* `trace_metadata` - Will set the trace metadata to the provided value
* `trace_user_id` - Will set the trace user id to the provided value
#### Generation Specific Parameters
* `generation_id` - Identifier for the generation, auto-generated by default
* `generation_name` - Identifier for the generation, auto-generated by default
* `prompt` - Langfuse prompt object used for the generation, defaults to None
Any other key value pairs passed into the metadata not listed in the above spec for a `litellm` completion will be added as a metadata key value pair for the generation.
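To make the continuation flow described above concrete, here is a small sketch (not part of the diff; the trace id and metadata values are illustrative) of logging a follow-up generation onto an existing trace while letting `update_trace_keys` overwrite selected trace fields:

```python
import litellm
from litellm import completion

litellm.success_callback = ["langfuse"]

# Second call in an already-created trace: reuse the trace id and name the
# trace fields that this generation is allowed to overwrite.
response = completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "follow-up question"}],
    metadata={
        "existing_trace_id": "trace-id22",           # continue this trace
        "trace_metadata": {"key": "updated_value"},  # new metadata for the trace
        "update_trace_keys": ["input", "output", "trace_metadata"],
    },
)
```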
### Use LangChain ChatLiteLLM + Langfuse
Pass `trace_user_id`, `session_id` in model_kwargs
```python

View file

@@ -0,0 +1,54 @@
# Deepseek
https://deepseek.com/
**We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['DEEPSEEK_API_KEY']
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['DEEPSEEK_API_KEY'] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['DEEPSEEK_API_KEY'] = ""
response = completion(
model="deepseek/deepseek-chat",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models - ALL Deepseek Models Supported!
We support ALL Deepseek models, just set `deepseek/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| deepseek-chat | `completion(model="deepseek/deepseek-chat", messages)` |
| deepseek-coder | `completion(model="deepseek/deepseek-chat", messages)` |
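If the new provider is used through the LiteLLM proxy rather than the SDK, a config entry along these lines would be expected. This is a sketch based on the proxy's usual `model_list` format, not content taken from this commit:

```yaml
model_list:
  - model_name: deepseek-chat
    litellm_params:
      model: deepseek/deepseek-chat
      api_key: os.environ/DEEPSEEK_API_KEY
```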

View file

@@ -45,13 +45,13 @@ for chunk in response:
All models listed here https://docs.mistral.ai/platform/endpoints are supported. We actively maintain the list of models, pricing, token window, etc. [here](https://github.com/BerriAI/litellm/blob/c1b25538277206b9f00de5254d80d6a83bb19a29/model_prices_and_context_window.json).

| Model Name | Function Call |
-|--------------------------|--------------------------------------------------------------|
-| mistral-tiny | `completion(model="mistral/mistral-tiny", messages)` |
-| mistral-small | `completion(model="mistral/mistral-small", messages)` |
-| mistral-medium | `completion(model="mistral/mistral-medium", messages)` |
-| mistral-large-latest | `completion(model="mistral/mistral-large-latest", messages)` |
-| open-mixtral-8x22b | `completion(model="mistral/open-mixtral-8x22b", messages)` |
+|----------------|--------------------------------------------------------------|
+| Mistral Small | `completion(model="mistral/mistral-small-latest", messages)` |
+| Mistral Medium | `completion(model="mistral/mistral-medium-latest", messages)`|
+| Mistral Large | `completion(model="mistral/mistral-large-latest", messages)` |
+| Mistral 7B | `completion(model="mistral/open-mistral-7b", messages)` |
+| Mixtral 8x7B | `completion(model="mistral/open-mixtral-8x7b", messages)` |
+| Mixtral 8x22B | `completion(model="mistral/open-mixtral-8x22b", messages)` |

## Function Calling
@@ -116,6 +116,6 @@ All models listed here https://docs.mistral.ai/platform/endpoints are supported
| Model Name | Function Call |
|--------------------------|--------------------------------------------------------------|
-| mistral-embed | `embedding(model="mistral/mistral-embed", input)` |
+| Mistral Embeddings | `embedding(model="mistral/mistral-embed", input)` |

View file

@@ -17,6 +17,7 @@ This is a new feature, and subject to changes based on feedback.
### Step 1. Setup Proxy
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
+- `JWT_AUDIENCE`: This is the audience used for decoding the JWT. If not set, the decode step will not verify the audience.

```bash
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"

View file

@@ -12,8 +12,8 @@ Requirements:
You can set budgets at 3 levels:
- For the proxy
-- For a user
+- For an internal user
-- For a 'user' passed to `/chat/completions`, `/embeddings` etc
+- For an end-user
- For a key
- For a key (model specific budgets)
@@ -58,7 +58,7 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
</TabItem>
-<TabItem value="per-user" label="For User">
+<TabItem value="per-user" label="For Internal User">

Apply a budget across multiple keys.
@@ -165,12 +165,12 @@ curl --location 'http://localhost:4000/team/new' \
}
```
</TabItem>
-<TabItem value="per-user-chat" label="For 'user' passed to /chat/completions">
+<TabItem value="per-user-chat" label="For End User">

Use this to budget `user` passed to `/chat/completions`, **without needing to create a key for every user**

**Step 1. Modify config.yaml**
-Define `litellm.max_user_budget`
+Define `litellm.max_end_user_budget`
```yaml
general_settings:
  master_key: sk-1234
@@ -328,7 +328,7 @@ You can set:
- max parallel requests

<Tabs>
-<TabItem value="per-user" label="Per User">
+<TabItem value="per-user" label="Per Internal User">

Use `/user/new`, to persist rate limits across multiple keys.
@@ -408,7 +408,7 @@ curl --location 'http://localhost:4000/user/new' \
```
-## Create new keys for existing user
+## Create new keys for existing internal user

Just include user_id in the `/key/generate` request.
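The config hunk above is cut off in this view; for orientation, a sketch of how the renamed setting might appear in `config.yaml`. The section placement and the example value are assumptions on the editor's part, not content from this commit:

```yaml
general_settings:
  master_key: sk-1234

litellm_settings:
  max_end_user_budget: 0.0001  # assumed: USD budget applied per end-user passed via `user`
```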

View file

@@ -96,7 +96,7 @@ print(response)
- `router.aimage_generation()` - async image generation calls

## Advanced - Routing Strategies
-#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based
+#### Routing Strategies - Weighted Pick, Rate Limit Aware, Least Busy, Latency Based, Cost Based

Router provides 4 strategies for routing your calls across multiple deployments:
@@ -467,6 +467,101 @@ async def router_acompletion():
asyncio.run(router_acompletion())
```
</TabItem>
<TabItem value="lowest-cost" label="Lowest Cost Routing (Async)">
Picks a deployment based on the lowest cost
How this works:
- Get all healthy deployments
- Select all deployments that are under their provided `rpm/tpm` limits
- For each deployment check if `litellm_param["model"]` exists in [`litellm_model_cost_map`](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json)
- if deployment does not exist in `litellm_model_cost_map` -> use deployment_cost= `$1`
- Select deployment with lowest cost
```python
from litellm import Router
import asyncio
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "gpt-4"},
"model_info": {"id": "openai-gpt-4"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {"model": "groq/llama3-8b-8192"},
"model_info": {"id": "groq-llama"},
},
]
# init router
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
print(response._hidden_params["model_id"]) # expect groq-llama, since groq/llama has lowest cost
return response
asyncio.run(router_acompletion())
```
#### Using Custom Input/Output pricing
Set `litellm_params["input_cost_per_token"]` and `litellm_params["output_cost_per_token"]` for using custom pricing when routing
```python
model_list = [
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"input_cost_per_token": 0.00003,
"output_cost_per_token": 0.00003,
},
"model_info": {"id": "chatgpt-v-experimental"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-1",
"input_cost_per_token": 0.000000001,
"output_cost_per_token": 0.00000001,
},
"model_info": {"id": "chatgpt-v-1"},
},
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-5",
"input_cost_per_token": 10,
"output_cost_per_token": 12,
},
"model_info": {"id": "chatgpt-v-5"},
},
]
# init router
router = Router(model_list=model_list, routing_strategy="cost-based-routing")
async def router_acompletion():
response = await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}]
)
print(response)
print(response._hidden_params["model_id"]) # expect chatgpt-v-1, since chatgpt-v-1 has lowest cost
return response
asyncio.run(router_acompletion())
```
</TabItem>
</Tabs>
@@ -991,6 +1086,46 @@ async def test_acompletion_caching_on_router_caching_groups():
asyncio.run(test_acompletion_caching_on_router_caching_groups())
```
## Alerting 🚨
Send alerts to slack / your webhook url for the following events
- LLM API Exceptions
- Slow LLM Responses
Get a slack webhook url from https://api.slack.com/messaging/webhooks
#### Usage
Initialize an `AlertingConfig` and pass it to `litellm.Router`. The following code will trigger an alert because `api_key=bad-key` which is invalid
```python
from litellm.router import AlertingConfig
import litellm
import os
router = litellm.Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "bad_key",
},
}
],
alerting_config= AlertingConfig(
alerting_threshold=10, # threshold for slow / hanging llm responses (in seconds). Defaults to 300 seconds
webhook_url= os.getenv("SLACK_WEBHOOK_URL") # webhook you want to send alerts to
),
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except:
pass
```
## Track cost for Azure Deployments
**Problem**: Azure returns `gpt-4` in the response when `azure/gpt-4-1106-preview` is used. This leads to inaccurate cost tracking

@@ -1159,6 +1294,7 @@ def __init__(
        "least-busy",
        "usage-based-routing",
        "latency-based-routing",
+       "cost-based-routing",
    ] = "simple-shuffle",
    ## DEBUGGING ##

View file

@@ -134,6 +134,7 @@ const sidebars = {
        "providers/ollama",
        "providers/perplexity",
        "providers/groq",
+       "providers/deepseek",
        "providers/fireworks_ai",
        "providers/vllm",
        "providers/xinference",

View file

@@ -291,7 +291,7 @@ def _create_clickhouse_aggregate_tables(client=None, table_names=[]):
def _forecast_daily_cost(data: list):
-    import requests
+    import requests  # type: ignore
    from datetime import datetime, timedelta

    if len(data) == 0:

View file

@@ -361,6 +361,7 @@ openai_compatible_endpoints: List = [
    "api.deepinfra.com/v1/openai",
    "api.mistral.ai/v1",
    "api.groq.com/openai/v1",
+   "api.deepseek.com/v1",
    "api.together.xyz/v1",
]
@@ -369,6 +370,7 @@ openai_compatible_providers: List = [
    "anyscale",
    "mistral",
    "groq",
+   "deepseek",
    "deepinfra",
    "perplexity",
    "xinference",
@@ -523,6 +525,7 @@ provider_list: List = [
    "anyscale",
    "mistral",
    "groq",
+   "deepseek",
    "maritalk",
    "voyage",
    "cloudflare",

View file

@@ -10,8 +10,8 @@
# s/o [@Frank Colson](https://www.linkedin.com/in/frank-colson-422b9b183/) for this redis implementation
import os
import inspect
-import redis, litellm
+import redis, litellm  # type: ignore
-import redis.asyncio as async_redis
+import redis.asyncio as async_redis  # type: ignore
from typing import List, Optional

View file

@@ -10,7 +10,7 @@
import os, json, time
import litellm
from litellm.utils import ModelResponse
-import requests, threading
+import requests, threading  # type: ignore
from typing import Optional, Union, Literal

View file

@@ -106,7 +106,7 @@ class InMemoryCache(BaseCache):
                return_val.append(val)
        return return_val

-    async def async_increment(self, key, value: int, **kwargs) -> int:
+    async def async_increment(self, key, value: float, **kwargs) -> float:
        # get the value
        init_value = await self.async_get_cache(key=key) or 0
        value = init_value + value
@@ -423,12 +423,12 @@ class RedisCache(BaseCache):
            if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
                await self.flush_cache_buffer()  # logging done in here

-    async def async_increment(self, key, value: int, **kwargs) -> int:
+    async def async_increment(self, key, value: float, **kwargs) -> float:
        _redis_client = self.init_async_client()
        start_time = time.time()
        try:
            async with _redis_client as redis_client:
-                result = await redis_client.incr(name=key, amount=value)
+                result = await redis_client.incrbyfloat(name=key, amount=value)
                ## LOGGING ##
                end_time = time.time()
                _duration = end_time - start_time
@@ -1382,18 +1382,41 @@ class DualCache(BaseCache):
            print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
            traceback.print_exc()
async def async_batch_set_cache(
self, cache_list: list, local_only: bool = False, **kwargs
):
"""
Batch write values to the cache
"""
print_verbose(
f"async batch set cache: cache keys: {cache_list}; local_only: {local_only}"
)
try:
if self.in_memory_cache is not None:
await self.in_memory_cache.async_set_cache_pipeline(
cache_list=cache_list, **kwargs
)
if self.redis_cache is not None and local_only == False:
await self.redis_cache.async_set_cache_pipeline(
cache_list=cache_list, ttl=kwargs.get("ttl", None)
)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton async add_cache: {str(e)}")
traceback.print_exc()
    async def async_increment_cache(
-        self, key, value: int, local_only: bool = False, **kwargs
-    ) -> int:
+        self, key, value: float, local_only: bool = False, **kwargs
+    ) -> float:
        """
        Key - the key in cache
-        Value - int - the value you want to increment by
-        Returns - int - the incremented value
+        Value - float - the value you want to increment by
+        Returns - float - the incremented value
        """
        try:
-            result: int = value
+            result: float = value
            if self.in_memory_cache is not None:
                result = await self.in_memory_cache.async_increment(
                    key, value, **kwargs
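For orientation, a sketch of how the widened `async_increment_cache` signature (int to float) and the new `async_batch_set_cache` might be called. The `(key, value)` tuple format for `cache_list` mirrors the existing `async_set_cache_pipeline` helpers and is an assumption, not something shown in this hunk:

```python
import asyncio
from litellm.caching import DualCache

async def demo():
    cache = DualCache()  # in-memory only unless a RedisCache is also passed in

    # Fractional spend can now be accumulated, since values are floats
    # and the Redis path uses INCRBYFLOAT instead of INCR.
    total = await cache.async_increment_cache(key="user:123:spend", value=0.25)
    print(total)

    # Batch-write several key/value pairs in one call (tuple format assumed).
    await cache.async_batch_set_cache(
        cache_list=[("model:gpt-4:tpm", 1000), ("model:gpt-4:rpm", 10)],
        ttl=60,
    )

asyncio.run(demo())
```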

View file

@@ -1,7 +1,6 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
-import requests

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -4,18 +4,30 @@ import datetime

class AthinaLogger:
    def __init__(self):
        import os

        self.athina_api_key = os.getenv("ATHINA_API_KEY")
        self.headers = {
            "athina-api-key": self.athina_api_key,
-            "Content-Type": "application/json"
+            "Content-Type": "application/json",
        }
        self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
-        self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]
+        self.additional_keys = [
+            "environment",
+            "prompt_slug",
+            "customer_id",
+            "customer_user_id",
+            "session_id",
+            "external_reference_id",
+            "context",
+            "expected_response",
+            "user_query",
+        ]

    def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
-        import requests
+        import requests  # type: ignore
        import json
        import traceback

        try:
            response_json = response_obj.model_dump() if response_obj else {}
            data = {
@@ -23,19 +35,30 @@ class AthinaLogger:
                "request": kwargs,
                "response": response_json,
                "prompt_tokens": response_json.get("usage", {}).get("prompt_tokens"),
-                "completion_tokens": response_json.get("usage", {}).get("completion_tokens"),
+                "completion_tokens": response_json.get("usage", {}).get(
+                    "completion_tokens"
+                ),
                "total_tokens": response_json.get("usage", {}).get("total_tokens"),
            }

-            if type(end_time) == datetime.datetime and type(start_time) == datetime.datetime:
-                data["response_time"] = int((end_time - start_time).total_seconds() * 1000)
+            if (
+                type(end_time) == datetime.datetime
+                and type(start_time) == datetime.datetime
+            ):
+                data["response_time"] = int(
+                    (end_time - start_time).total_seconds() * 1000
+                )

            if "messages" in kwargs:
                data["prompt"] = kwargs.get("messages", None)

            # Directly add tools or functions if present
            optional_params = kwargs.get("optional_params", {})
-            data.update((k, v) for k, v in optional_params.items() if k in ["tools", "functions"])
+            data.update(
+                (k, v)
+                for k, v in optional_params.items()
+                if k in ["tools", "functions"]
+            )

            # Add additional metadata keys
            metadata = kwargs.get("litellm_params", {}).get("metadata", {})
@@ -44,11 +67,19 @@ class AthinaLogger:
                if key in metadata:
                    data[key] = metadata[key]

-            response = requests.post(self.athina_logging_url, headers=self.headers, data=json.dumps(data, default=str))
+            response = requests.post(
+                self.athina_logging_url,
+                headers=self.headers,
+                data=json.dumps(data, default=str),
+            )
            if response.status_code != 200:
-                print_verbose(f"Athina Logger Error - {response.text}, {response.status_code}")
+                print_verbose(
+                    f"Athina Logger Error - {response.text}, {response.status_code}"
+                )
            else:
                print_verbose(f"Athina Logger Succeeded - {response.text}")
        except Exception as e:
-            print_verbose(f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}")
+            print_verbose(
+                f"Athina Logger Error - {e}, Stack trace: {traceback.format_exc()}"
+            )
            pass

View file

@@ -1,7 +1,7 @@
#### What this does ####
# On success + failure, log events to aispend.io
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -3,7 +3,6 @@
#### What this does ####
# On success, logs events to Promptlayer
import dotenv, os
-import requests

from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache

View file

@@ -1,7 +1,6 @@
#### What this does ####
# On success, logs events to Promptlayer
import dotenv, os
-import requests

from litellm.proxy._types import UserAPIKeyAuth
from litellm.caching import DualCache

View file

@@ -2,7 +2,7 @@
# On success + failure, log events to Supabase
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -2,7 +2,7 @@
# On success + failure, log events to Supabase
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

View file

@@ -1,15 +1,17 @@
-import requests
+import requests  # type: ignore
import json
import traceback
from datetime import datetime, timezone


class GreenscaleLogger:
    def __init__(self):
        import os

        self.greenscale_api_key = os.getenv("GREENSCALE_API_KEY")
        self.headers = {
            "api-key": self.greenscale_api_key,
-            "Content-Type": "application/json"
+            "Content-Type": "application/json",
        }
        self.greenscale_logging_url = os.getenv("GREENSCALE_ENDPOINT")
@@ -19,13 +21,18 @@ class GreenscaleLogger:
        data = {
            "modelId": kwargs.get("model"),
            "inputTokenCount": response_json.get("usage", {}).get("prompt_tokens"),
-            "outputTokenCount": response_json.get("usage", {}).get("completion_tokens"),
+            "outputTokenCount": response_json.get("usage", {}).get(
+                "completion_tokens"
+            ),
        }
-        data["timestamp"] = datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
+        data["timestamp"] = datetime.now(timezone.utc).strftime(
+            "%Y-%m-%dT%H:%M:%SZ"
+        )

        if type(end_time) == datetime and type(start_time) == datetime:
-            data["invocationLatency"] = int((end_time - start_time).total_seconds() * 1000)
+            data["invocationLatency"] = int(
+                (end_time - start_time).total_seconds() * 1000
+            )

        # Add additional metadata keys to tags
        tags = []
@@ -37,15 +44,25 @@ class GreenscaleLogger:
            elif key == "greenscale_application":
                data["application"] = value
            else:
-                tags.append({"key": key.replace("greenscale_", ""), "value": str(value)})
+                tags.append(
+                    {"key": key.replace("greenscale_", ""), "value": str(value)}
+                )

        data["tags"] = tags

-        response = requests.post(self.greenscale_logging_url, headers=self.headers, data=json.dumps(data, default=str))
+        response = requests.post(
+            self.greenscale_logging_url,
+            headers=self.headers,
+            data=json.dumps(data, default=str),
+        )
        if response.status_code != 200:
-            print_verbose(f"Greenscale Logger Error - {response.text}, {response.status_code}")
+            print_verbose(
+                f"Greenscale Logger Error - {response.text}, {response.status_code}"
+            )
        else:
            print_verbose(f"Greenscale Logger Succeeded - {response.text}")
    except Exception as e:
-        print_verbose(f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}")
+        print_verbose(
+            f"Greenscale Logger Error - {e}, Stack trace: {traceback.format_exc()}"
+        )
        pass

View file

@@ -1,7 +1,7 @@
#### What this does ####
# On success, logs events to Helicone
import dotenv, os
-import requests
+import requests  # type: ignore
import litellm

dotenv.load_dotenv()  # Loading env variables using dotenv

View file

@@ -262,6 +262,7 @@ class LangFuseLogger:
        try:
            tags = []
+            metadata = copy.deepcopy(metadata)  # Avoid modifying the original metadata
            supports_tags = Version(langfuse.version.__version__) >= Version("2.6.3")
            supports_prompt = Version(langfuse.version.__version__) >= Version("2.7.3")
            supports_costs = Version(langfuse.version.__version__) >= Version("2.7.3")
@@ -272,36 +273,9 @@ class LangFuseLogger:
            print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")

            if supports_tags:
-                metadata_tags = metadata.get("tags", [])
+                metadata_tags = metadata.pop("tags", [])
                tags = metadata_tags
trace_name = metadata.get("trace_name", None)
trace_id = metadata.get("trace_id", None)
existing_trace_id = metadata.get("existing_trace_id", None)
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
if existing_trace_id is not None:
trace_params = {"id": existing_trace_id}
else: # don't overwrite an existing trace
trace_params = {
"name": trace_name,
"input": input,
"user_id": metadata.get("trace_user_id", user_id),
"id": trace_id,
"session_id": metadata.get("session_id", None),
}
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output
cost = kwargs.get("response_cost", None)
print_verbose(f"trace: {cost}")
            # Clean Metadata before logging - never log raw metadata
            # the raw metadata can contain circular references which leads to infinite recursion
            # we clean out all extra litellm metadata params before logging
@@ -328,6 +302,66 @@ class LangFuseLogger:
                else:
                    clean_metadata[key] = value
session_id = clean_metadata.pop("session_id", None)
trace_name = clean_metadata.pop("trace_name", None)
trace_id = clean_metadata.pop("trace_id", None)
existing_trace_id = clean_metadata.pop("existing_trace_id", None)
update_trace_keys = clean_metadata.pop("update_trace_keys", [])
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
if existing_trace_id is not None:
trace_params = {"id": existing_trace_id}
# Update the following keys for this trace
for metadata_param_key in update_trace_keys:
trace_param_key = metadata_param_key.replace("trace_", "")
if trace_param_key not in trace_params:
updated_trace_value = clean_metadata.pop(
metadata_param_key, None
)
if updated_trace_value is not None:
trace_params[trace_param_key] = updated_trace_value
# Pop the trace specific keys that would have been popped if there were a new trace
for key in list(
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
):
clean_metadata.pop(key, None)
# Special keys that are found in the function arguments and not the metadata
if "input" in update_trace_keys:
trace_params["input"] = input
if "output" in update_trace_keys:
trace_params["output"] = output
else: # don't overwrite an existing trace
trace_params = {
"id": trace_id,
"name": trace_name,
"session_id": session_id,
"input": input,
"version": clean_metadata.pop(
"trace_version", clean_metadata.get("version", None)
), # If provided just version, it will applied to the trace as well, if applied a trace version it will take precedence
}
for key in list(
filter(lambda key: key.startswith("trace_"), clean_metadata.keys())
):
trace_params[key.replace("trace_", "")] = clean_metadata.pop(
key, None
)
if level == "ERROR":
trace_params["status_message"] = output
else:
trace_params["output"] = output
cost = kwargs.get("response_cost", None)
print_verbose(f"trace: {cost}")
            if (
                litellm._langfuse_default_tags is not None
                and isinstance(litellm._langfuse_default_tags, list)
@@ -387,7 +421,7 @@ class LangFuseLogger:
                "completion_tokens": response_obj["usage"]["completion_tokens"],
                "total_cost": cost if supports_costs else None,
            }
-            generation_name = metadata.get("generation_name", None)
+            generation_name = clean_metadata.pop("generation_name", None)
            if generation_name is None:
                # just log `litellm-{call_type}` as the generation name
                generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
@@ -402,7 +436,7 @@ class LangFuseLogger:
            generation_params = {
                "name": generation_name,
-                "id": metadata.get("generation_id", generation_id),
+                "id": clean_metadata.pop("generation_id", generation_id),
                "start_time": start_time,
                "end_time": end_time,
                "model": kwargs["model"],
@@ -412,10 +446,11 @@ class LangFuseLogger:
                "usage": usage,
                "metadata": clean_metadata,
                "level": level,
+                "version": clean_metadata.pop("version", None),
            }

            if supports_prompt:
-                generation_params["prompt"] = metadata.get("prompt", None)
+                generation_params["prompt"] = clean_metadata.pop("prompt", None)

            if output is not None and isinstance(output, str) and level == "ERROR":
                generation_params["status_message"] = output

View file

@@ -1,15 +1,14 @@
#### What this does ####
# On success, logs events to Langsmith
-import dotenv, os
+import dotenv, os  # type: ignore
-import requests
+import requests  # type: ignore
-import requests
from datetime import datetime

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
import asyncio
import types
-from pydantic import BaseModel
+from pydantic import BaseModel  # type: ignore

def is_serializable(value):
@@ -79,8 +78,6 @@ class LangsmithLogger:
            except:
                response_obj = response_obj.dict()  # type: ignore

-            print(f"response_obj: {response_obj}")
            data = {
                "name": run_name,
                "run_type": "llm",  # this should always be llm, since litellm always logs llm calls. Langsmith allow us to log "chain"
@@ -90,7 +87,6 @@ class LangsmithLogger:
                "start_time": start_time,
                "end_time": end_time,
            }
-            print(f"data: {data}")

            response = requests.post(
                "https://api.smith.langchain.com/runs",

View file

@@ -2,7 +2,6 @@
## On Success events log cost to OpenMeter - https://github.com/BerriAI/litellm/issues/1268
import dotenv, os, json
-import requests
import litellm

dotenv.load_dotenv()  # Loading env variables using dotenv
@@ -60,7 +59,7 @@ class OpenMeterLogger(CustomLogger):
            "total_tokens": response_obj["usage"].get("total_tokens"),
        }

-        subject = kwargs.get("user", None),  # end-user passed in via 'user' param
+        subject = (kwargs.get("user", None),)  # end-user passed in via 'user' param
        if not subject:
            raise Exception("OpenMeter: user is required")

View file

@@ -3,7 +3,7 @@
# On success, log events to Prometheus
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
@@ -19,7 +19,6 @@ class PrometheusLogger:
        **kwargs,
    ):
        try:
-            print(f"in init prometheus metrics")
            from prometheus_client import Counter

            self.litellm_llm_api_failed_requests_metric = Counter(

View file

@@ -4,7 +4,7 @@
import dotenv, os
-import requests
+import requests  # type: ignore

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback
@@ -183,7 +183,6 @@ class PrometheusServicesLogger:
        )

    async def async_service_failure_hook(self, payload: ServiceLoggerPayload):
-        print(f"received error payload: {payload.error}")
        if self.mock_testing:
            self.mock_testing_failure_calls += 1

View file

@@ -1,12 +1,13 @@
#### What this does ####
# On success, logs events to Promptlayer
import dotenv, os
-import requests
+import requests  # type: ignore
from pydantic import BaseModel

dotenv.load_dotenv()  # Loading env variables using dotenv
import traceback

class PromptLayerLogger:
    # Class variables or attributes
    def __init__(self):
@@ -32,7 +33,11 @@ class PromptLayerLogger:
            tags = kwargs["litellm_params"]["metadata"]["pl_tags"]

            # Remove "pl_tags" from metadata
-            metadata = {k:v for k, v in kwargs["litellm_params"]["metadata"].items() if k != "pl_tags"}
+            metadata = {
+                k: v
+                for k, v in kwargs["litellm_params"]["metadata"].items()
+                if k != "pl_tags"
+            }

        print_verbose(
            f"Prompt Layer Logging - Enters logging function for model kwargs: {new_kwargs}\n, response: {response_obj}"

View file

@ -2,7 +2,6 @@
# On success + failure, log events to Supabase # On success + failure, log events to Supabase
import dotenv, os import dotenv, os
import requests
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback

View file

@ -1,25 +1,82 @@
#### What this does #### #### What this does ####
# Class for sending Slack Alerts # # Class for sending Slack Alerts #
import dotenv, os import dotenv, os
from litellm.proxy._types import UserAPIKeyAuth
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import copy
import traceback
from litellm._logging import verbose_logger, verbose_proxy_logger from litellm._logging import verbose_logger, verbose_proxy_logger
import litellm import litellm, threading
from typing import List, Literal, Any, Union, Optional, Dict from typing import List, Literal, Any, Union, Optional, Dict
from litellm.caching import DualCache from litellm.caching import DualCache
import asyncio import asyncio
import aiohttp import aiohttp
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
import datetime import datetime
from pydantic import BaseModel
from enum import Enum
from datetime import datetime as dt, timedelta
from litellm.integrations.custom_logger import CustomLogger
import random
class SlackAlerting: class LiteLLMBase(BaseModel):
"""
Implements default functions, all pydantic objects should have.
"""
def json(self, **kwargs):
try:
return self.model_dump() # noqa
except:
# if using pydantic v1
return self.dict()
class SlackAlertingArgs(LiteLLMBase):
daily_report_frequency: int = 12 * 60 * 60 # 12 hours
report_check_interval: int = 5 * 60 # 5 minutes
class DeploymentMetrics(LiteLLMBase):
"""
Metrics per deployment, stored in cache
Used for daily reporting
"""
id: str
"""id of deployment in router model list"""
failed_request: bool
"""did it fail the request?"""
latency_per_output_token: Optional[float]
"""latency/output token of deployment"""
updated_at: dt
"""Current time of deployment being updated"""
class SlackAlertingCacheKeys(Enum):
"""
Enum for deployment daily metrics keys - {deployment_id}:{enum}
"""
failed_requests_key = "failed_requests_daily_metrics"
latency_key = "latency_daily_metrics"
report_sent_key = "daily_metrics_report_sent"
class SlackAlerting(CustomLogger):
"""
Class for sending Slack Alerts
"""
# Class variables or attributes # Class variables or attributes
def __init__( def __init__(
self, self,
alerting_threshold: float = 300, internal_usage_cache: Optional[DualCache] = None,
alerting_threshold: float = 300, # threshold for slow / hanging llm responses (in seconds)
alerting: Optional[List] = [], alerting: Optional[List] = [],
alert_types: Optional[ alert_types: Optional[
List[ List[
@ -29,6 +86,7 @@ class SlackAlerting:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
] ]
] = [ ] = [
@ -37,18 +95,23 @@ class SlackAlerting:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
], ],
alert_to_webhook_url: Optional[ alert_to_webhook_url: Optional[
Dict Dict
] = None, # if user wants to separate alerts to diff channels ] = None, # if user wants to separate alerts to diff channels
alerting_args={},
default_webhook_url: Optional[str] = None,
): ):
self.alerting_threshold = alerting_threshold self.alerting_threshold = alerting_threshold
self.alerting = alerting self.alerting = alerting
self.alert_types = alert_types self.alert_types = alert_types
self.internal_usage_cache = DualCache() self.internal_usage_cache = internal_usage_cache or DualCache()
self.async_http_handler = AsyncHTTPHandler() self.async_http_handler = AsyncHTTPHandler()
self.alert_to_webhook_url = alert_to_webhook_url self.alert_to_webhook_url = alert_to_webhook_url
pass self.is_running = False
self.alerting_args = SlackAlertingArgs(**alerting_args)
self.default_webhook_url = default_webhook_url
def update_values( def update_values(
self, self,
@ -56,6 +119,7 @@ class SlackAlerting:
alerting_threshold: Optional[float] = None, alerting_threshold: Optional[float] = None,
alert_types: Optional[List] = None, alert_types: Optional[List] = None,
alert_to_webhook_url: Optional[Dict] = None, alert_to_webhook_url: Optional[Dict] = None,
alerting_args: Optional[Dict] = None,
): ):
if alerting is not None: if alerting is not None:
self.alerting = alerting self.alerting = alerting
@ -63,7 +127,8 @@ class SlackAlerting:
self.alerting_threshold = alerting_threshold self.alerting_threshold = alerting_threshold
if alert_types is not None: if alert_types is not None:
self.alert_types = alert_types self.alert_types = alert_types
if alerting_args is not None:
self.alerting_args = SlackAlertingArgs(**alerting_args)
if alert_to_webhook_url is not None: if alert_to_webhook_url is not None:
# update the dict # update the dict
if self.alert_to_webhook_url is None: if self.alert_to_webhook_url is None:
@ -90,18 +155,23 @@ class SlackAlerting:
def _add_langfuse_trace_id_to_alert( def _add_langfuse_trace_id_to_alert(
self, self,
request_info: str,
request_data: Optional[dict] = None, request_data: Optional[dict] = None,
kwargs: Optional[dict] = None, ) -> Optional[str]:
type: Literal["hanging_request", "slow_response"] = "hanging_request", """
start_time: Optional[datetime.datetime] = None, Returns langfuse trace url
end_time: Optional[datetime.datetime] = None, """
):
# do nothing for now # do nothing for now
pass if (
return request_info request_data is not None
and request_data.get("metadata", {}).get("trace_id", None) is not None
):
trace_id = request_data["metadata"]["trace_id"]
if litellm.utils.langFuseLogger is not None:
base_url = litellm.utils.langFuseLogger.Langfuse.base_url
return f"{base_url}/trace/{trace_id}"
return None
def _response_taking_too_long_callback( def _response_taking_too_long_callback_helper(
self, self,
kwargs, # kwargs to completion kwargs, # kwargs to completion
start_time, start_time,
@ -166,7 +236,7 @@ class SlackAlerting:
return return
time_difference_float, model, api_base, messages = ( time_difference_float, model, api_base, messages = (
self._response_taking_too_long_callback( self._response_taking_too_long_callback_helper(
kwargs=kwargs, kwargs=kwargs,
start_time=start_time, start_time=start_time,
end_time=end_time, end_time=end_time,
@ -182,6 +252,9 @@ class SlackAlerting:
and "metadata" in kwargs["litellm_params"] and "metadata" in kwargs["litellm_params"]
): ):
_metadata = kwargs["litellm_params"]["metadata"] _metadata = kwargs["litellm_params"]["metadata"]
request_info = litellm.utils._add_key_name_and_team_to_alert(
request_info=request_info, metadata=_metadata
)
_deployment_latency_map = self._get_deployment_latencies_to_alert( _deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=_metadata metadata=_metadata
@ -196,8 +269,178 @@ class SlackAlerting:
alert_type="llm_too_slow", alert_type="llm_too_slow",
) )
async def log_failure_event(self, original_exception: Exception): async def async_update_daily_reports(
pass self, deployment_metrics: DeploymentMetrics
) -> int:
"""
Store the perf by deployment in cache
- Number of failed requests per deployment
- Latency / output tokens per deployment
'deployment_id:daily_metrics:failed_requests'
'deployment_id:daily_metrics:latency_per_output_token'
Returns
int - count of metrics set (1 - if just latency, 2 - if failed + latency)
"""
return_val = 0
try:
## FAILED REQUESTS ##
if deployment_metrics.failed_request:
await self.internal_usage_cache.async_increment_cache(
key="{}:{}".format(
deployment_metrics.id,
SlackAlertingCacheKeys.failed_requests_key.value,
),
value=1,
)
return_val += 1
## LATENCY ##
if deployment_metrics.latency_per_output_token is not None:
await self.internal_usage_cache.async_increment_cache(
key="{}:{}".format(
deployment_metrics.id, SlackAlertingCacheKeys.latency_key.value
),
value=deployment_metrics.latency_per_output_token,
)
return_val += 1
return return_val
except Exception as e:
return 0
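The accounting above relies on the `{deployment_id}:{metric}` key scheme from `SlackAlertingCacheKeys`. A rough sketch of the same bookkeeping against a plain dict (a stand-in for `DualCache`, not the real cache API), just to make the increment semantics concrete:

```
# Dict-backed stand-in for the async cache; illustrates the key scheme only.
from typing import Dict, Optional

FAILED_KEY = "failed_requests_daily_metrics"
LATENCY_KEY = "latency_daily_metrics"

def update_daily_metrics(
    cache: Dict[str, float],
    deployment_id: str,
    failed_request: bool,
    latency_per_output_token: Optional[float],
) -> int:
    """Returns how many metrics were touched (0, 1, or 2)."""
    updated = 0
    if failed_request:
        key = f"{deployment_id}:{FAILED_KEY}"
        cache[key] = cache.get(key, 0) + 1  # failed-request counter
        updated += 1
    if latency_per_output_token is not None:
        key = f"{deployment_id}:{LATENCY_KEY}"
        cache[key] = cache.get(key, 0.0) + latency_per_output_token  # summed latency/token
        updated += 1
    return updated

cache: Dict[str, float] = {}
update_daily_metrics(cache, "model-123", failed_request=True, latency_per_output_token=0.004)
print(cache)  # {'model-123:failed_requests_daily_metrics': 1, 'model-123:latency_daily_metrics': 0.004}
```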
async def send_daily_reports(self, router) -> bool:
"""
Send a daily report on:
- Top 5 deployments with most failed requests
- Top 5 slowest deployments (normalized by latency/output tokens)
Get the value from redis cache (if available) or in-memory and send it
Cleanup:
- reset values in cache -> prevent memory leak
Returns:
True -> if successfully sent
False -> if not sent
"""
ids = router.get_model_ids()
# get keys
failed_request_keys = [
"{}:{}".format(id, SlackAlertingCacheKeys.failed_requests_key.value)
for id in ids
]
latency_keys = [
"{}:{}".format(id, SlackAlertingCacheKeys.latency_key.value) for id in ids
]
combined_metrics_keys = failed_request_keys + latency_keys # reduce cache calls
combined_metrics_values = await self.internal_usage_cache.async_batch_get_cache(
keys=combined_metrics_keys
) # [1, 2, None, ..]
all_none = True
for val in combined_metrics_values:
if val is not None:
all_none = False
if all_none:
return False
failed_request_values = combined_metrics_values[
: len(failed_request_keys)
]  # [1, 2, None, ..]
latency_values = combined_metrics_values[len(failed_request_keys) :]
# find top 5 failed
## Replace None values with a placeholder value (0 in this case)
placeholder_value = 0
replaced_failed_values = [
value if value is not None else placeholder_value
for value in failed_request_values
]
## Get the indices of top 5 keys with the highest numerical values (ignoring None values)
top_5_failed = sorted(
range(len(replaced_failed_values)),
key=lambda i: replaced_failed_values[i],
reverse=True,
)[:5]
# find top 5 slowest
# Replace None values with a placeholder value (0 in this case)
placeholder_value = 0
replaced_slowest_values = [
value if value is not None else placeholder_value
for value in latency_values
]
# Get the indices of top 5 values with the highest numerical values (ignoring None values)
top_5_slowest = sorted(
range(len(replaced_slowest_values)),
key=lambda i: replaced_slowest_values[i],
reverse=True,
)[:5]
# format alert -> return the litellm model name + api base
message = f"\n\nHere are today's key metrics 📈: \n\n"
message += "\n\n*❗️ Top 5 Deployments with Most Failed Requests:*\n\n"
for i in range(len(top_5_failed)):
key = failed_request_keys[top_5_failed[i]].split(":")[0]
_deployment = router.get_model_info(key)
if isinstance(_deployment, dict):
deployment_name = _deployment["litellm_params"].get("model", "")
else:
return False
api_base = litellm.get_api_base(
model=deployment_name,
optional_params=(
_deployment["litellm_params"] if _deployment is not None else {}
),
)
if api_base is None:
api_base = ""
value = replaced_failed_values[top_5_failed[i]]
message += f"\t{i+1}. Deployment: `{deployment_name}`, Failed Requests: `{value}`, API Base: `{api_base}`\n"
message += "\n\n*😅 Top 5 Slowest Deployments:*\n\n"
for i in range(len(top_5_slowest)):
key = latency_keys[top_5_slowest[i]].split(":")[0]
_deployment = router.get_model_info(key)
if _deployment is not None:
deployment_name = _deployment["litellm_params"].get("model", "")
else:
deployment_name = ""
api_base = litellm.get_api_base(
model=deployment_name,
optional_params=(
_deployment["litellm_params"] if _deployment is not None else {}
),
)
value = round(replaced_slowest_values[top_5_slowest[i]], 3)
message += f"\t{i+1}. Deployment: `{deployment_name}`, Latency per output token: `{value}s/token`, API Base: `{api_base}`\n\n"
# cache cleanup -> reset values to 0
latency_cache_keys = [(key, 0) for key in latency_keys]
failed_request_cache_keys = [(key, 0) for key in failed_request_keys]
combined_metrics_cache_keys = latency_cache_keys + failed_request_cache_keys
await self.internal_usage_cache.async_batch_set_cache(
cache_list=combined_metrics_cache_keys
)
# send alert
await self.send_alert(message=message, level="Low", alert_type="daily_reports")
return True
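The report picks its top-5 lists by sorting index positions after substituting 0 for missing cache values. A compact illustration of that selection step on made-up numbers:

```
# Top-5 selection as used for the daily report, on sample data.
failed_values = [3, None, 7, 0, None, 12, 1]

# Replace None (no cached value) with 0 so every deployment is comparable.
replaced = [v if v is not None else 0 for v in failed_values]

# Sort *indices* by value, descending, and keep the first five.
top_5 = sorted(range(len(replaced)), key=lambda i: replaced[i], reverse=True)[:5]

print(top_5)                         # [5, 2, 0, 6, 1]
print([replaced[i] for i in top_5])  # [12, 7, 3, 1, 0]
```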
async def response_taking_too_long( async def response_taking_too_long(
self, self,
@ -255,6 +498,11 @@ class SlackAlerting:
# in that case we fallback to the api base set in the request metadata # in that case we fallback to the api base set in the request metadata
_metadata = request_data["metadata"] _metadata = request_data["metadata"]
_api_base = _metadata.get("api_base", "") _api_base = _metadata.get("api_base", "")
request_info = litellm.utils._add_key_name_and_team_to_alert(
request_info=request_info, metadata=_metadata
)
if _api_base is None: if _api_base is None:
_api_base = "" _api_base = ""
request_info += f"\nAPI Base: `{_api_base}`" request_info += f"\nAPI Base: `{_api_base}`"
@ -264,14 +512,13 @@ class SlackAlerting:
) )
if "langfuse" in litellm.success_callback: if "langfuse" in litellm.success_callback:
request_info = self._add_langfuse_trace_id_to_alert( langfuse_url = self._add_langfuse_trace_id_to_alert(
request_info=request_info,
request_data=request_data, request_data=request_data,
type="hanging_request",
start_time=start_time,
end_time=end_time,
) )
if langfuse_url is not None:
request_info += "\n🪢 Langfuse Trace: {}".format(langfuse_url)
# add deployment latencies to alert # add deployment latencies to alert
_deployment_latency_map = self._get_deployment_latencies_to_alert( _deployment_latency_map = self._get_deployment_latencies_to_alert(
metadata=request_data.get("metadata", {}) metadata=request_data.get("metadata", {})
@ -404,6 +651,53 @@ class SlackAlerting:
return return
async def model_added_alert(self, model_name: str, litellm_model_name: str):
model_info = litellm.model_cost.get(litellm_model_name, {})
model_info_str = ""
for k, v in model_info.items():
if k == "input_cost_per_token" or k == "output_cost_per_token":
# when converting to string it should not be 1.63e-06
v = "{:.8f}".format(v)
model_info_str += f"{k}: {v}\n"
message = f"""
*🚅 New Model Added*
Model Name: `{model_name}`
Usage OpenAI Python SDK:
```
import openai
client = openai.OpenAI(
api_key="your_api_key",
base_url="{os.getenv("PROXY_BASE_URL", "http://0.0.0.0:4000")}"
)
response = client.chat.completions.create(
model="{model_name}", # model to send to the proxy
messages = [
{{
"role": "user",
"content": "this is a test request, write a short poem"
}}
]
)
```
Model Info:
```
{model_info_str}
```
"""
await self.send_alert(
message=message, level="Low", alert_type="new_model_added"
)
pass
async def model_removed_alert(self, model_name: str):
pass
async def send_alert( async def send_alert(
self, self,
message: str, message: str,
@ -414,7 +708,11 @@ class SlackAlerting:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
"new_model_added",
"cooldown_deployment",
], ],
**kwargs,
): ):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -439,9 +737,16 @@ class SlackAlerting:
# Get the current timestamp # Get the current timestamp
current_time = datetime.now().strftime("%H:%M:%S") current_time = datetime.now().strftime("%H:%M:%S")
_proxy_base_url = os.getenv("PROXY_BASE_URL", None) _proxy_base_url = os.getenv("PROXY_BASE_URL", None)
if alert_type == "daily_reports" or alert_type == "new_model_added":
formatted_message = message
else:
formatted_message = ( formatted_message = (
f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}" f"Level: `{level}`\nTimestamp: `{current_time}`\n\nMessage: {message}"
) )
if kwargs:
for key, value in kwargs.items():
formatted_message += f"\n\n{key}: `{value}`\n\n"
if _proxy_base_url is not None: if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`" formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
@ -451,6 +756,8 @@ class SlackAlerting:
and alert_type in self.alert_to_webhook_url and alert_type in self.alert_to_webhook_url
): ):
slack_webhook_url = self.alert_to_webhook_url[alert_type] slack_webhook_url = self.alert_to_webhook_url[alert_type]
elif self.default_webhook_url is not None:
slack_webhook_url = self.default_webhook_url
else: else:
slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None) slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL", None)
@ -468,3 +775,113 @@ class SlackAlerting:
pass pass
else: else:
print("Error sending slack alert. Error=", response.text) # noqa print("Error sending slack alert. Error=", response.text) # noqa
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
"""Log deployment latency"""
if "daily_reports" in self.alert_types:
model_id = (
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
)
response_s: timedelta = end_time - start_time
final_value = response_s
total_tokens = 0
if isinstance(response_obj, litellm.ModelResponse):
completion_tokens = response_obj.usage.completion_tokens
final_value = float(response_s.total_seconds() / completion_tokens)
await self.async_update_daily_reports(
DeploymentMetrics(
id=model_id,
failed_request=False,
latency_per_output_token=final_value,
updated_at=litellm.utils.get_utc_datetime(),
)
)
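The success hook normalizes latency by the number of completion tokens. A small worked example of that arithmetic (the timing and token count are made up; real code would also guard against a zero token count):

```
from datetime import datetime, timedelta

# Hypothetical request timing and token count.
start_time = datetime(2024, 5, 9, 12, 0, 0)
end_time = start_time + timedelta(seconds=3)
completion_tokens = 150

response_s: timedelta = end_time - start_time
latency_per_output_token = response_s.total_seconds() / completion_tokens
print(latency_per_output_token)  # 0.02 seconds per output token
```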
async def async_log_failure_event(self, kwargs, response_obj, start_time, end_time):
"""Log failure + deployment latency"""
if "daily_reports" in self.alert_types:
model_id = (
kwargs.get("litellm_params", {}).get("model_info", {}).get("id", "")
)
await self.async_update_daily_reports(
DeploymentMetrics(
id=model_id,
failed_request=True,
latency_per_output_token=None,
updated_at=litellm.utils.get_utc_datetime(),
)
)
if "llm_exceptions" in self.alert_types:
original_exception = kwargs.get("exception", None)
await self.send_alert(
message="LLM API Failure - " + str(original_exception),
level="High",
alert_type="llm_exceptions",
)
async def _run_scheduler_helper(self, llm_router) -> bool:
"""
Returns:
- True -> report sent
- False -> report not sent
"""
report_sent_bool = False
report_sent = await self.internal_usage_cache.async_get_cache(
key=SlackAlertingCacheKeys.report_sent_key.value
) # None | datetime
current_time = litellm.utils.get_utc_datetime()
if report_sent is None:
_current_time = current_time.isoformat()
await self.internal_usage_cache.async_set_cache(
key=SlackAlertingCacheKeys.report_sent_key.value,
value=_current_time,
)
else:
# check if current time - interval >= time last sent
delta = current_time - timedelta(
seconds=self.alerting_args.daily_report_frequency
)
if isinstance(report_sent, str):
report_sent = dt.fromisoformat(report_sent)
if delta >= report_sent:
# Sneak in the reporting logic here
await self.send_daily_reports(router=llm_router)
# Also, don't forget to update the report_sent time after sending the report!
_current_time = current_time.isoformat()
await self.internal_usage_cache.async_set_cache(
key=SlackAlertingCacheKeys.report_sent_key.value,
value=_current_time,
)
report_sent_bool = True
return report_sent_bool
async def _run_scheduled_daily_report(self, llm_router: Optional[Any] = None):
"""
If 'daily_reports' enabled
Ping redis cache every 5 minutes to check if we should send the report
If yes -> call send_daily_report()
"""
if llm_router is None or self.alert_types is None:
return
if "daily_reports" in self.alert_types:
while True:
await self._run_scheduler_helper(llm_router=llm_router)
interval = random.randint(
self.alerting_args.report_check_interval - 3,
self.alerting_args.report_check_interval + 3,
) # shuffle to prevent collisions
await asyncio.sleep(interval)
return
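Putting the pieces of this file together, a hedged sketch of how the daily-report alerting might be wired up by a caller. The import path, the `llm_router` argument, and the environment variable are assumptions; the exact wiring inside the proxy may differ.

```
# Usage sketch only; import path and router object are assumptions.
import asyncio
import os

from litellm.integrations.slack_alerting import SlackAlerting  # assumed module path

async def start_alerting(llm_router):  # llm_router: a configured litellm Router (placeholder)
    slack_alerting = SlackAlerting(
        alerting=["slack"],
        alert_types=["llm_exceptions", "llm_too_slow", "daily_reports"],
        alerting_args={
            "daily_report_frequency": 12 * 60 * 60,  # send at most every 12 hours
            "report_check_interval": 5 * 60,         # poll the cache roughly every 5 minutes
        },
        default_webhook_url=os.getenv("SLACK_WEBHOOK_URL"),
    )
    # Background task that decides, on each poll, whether a report is due.
    asyncio.create_task(
        slack_alerting._run_scheduled_daily_report(llm_router=llm_router)
    )
    # ... serve traffic; the success/failure hooks feed the daily metrics cache ...
```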

View file

@ -2,7 +2,7 @@
# On success + failure, log events to Supabase # On success + failure, log events to Supabase
import dotenv, os import dotenv, os
import requests import requests # type: ignore
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv
import traceback import traceback

View file

@ -1,8 +1,8 @@
import os, types, traceback import os, types, traceback
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, httpx import time, httpx # type: ignore
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message from litellm.utils import ModelResponse, Choices, Message
import litellm import litellm

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage
import httpx import httpx # type: ignore
class AlephAlphaError(Exception): class AlephAlphaError(Exception):

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests, copy import requests, copy # type: ignore
import time import time
from typing import Callable, Optional, List from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
@ -9,7 +9,7 @@ import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from .base import BaseLLM from .base import BaseLLM
import httpx import httpx # type: ignore
class AnthropicConstants(Enum): class AnthropicConstants(Enum):
@ -184,11 +184,6 @@ class AnthropicChatCompletion(BaseLLM):
message=str(completion_response["error"]), message=str(completion_response["error"]),
status_code=response.status_code, status_code=response.status_code,
) )
elif len(completion_response["content"]) == 0:
raise AnthropicError(
message="No content in response",
status_code=500,
)
else: else:
text_content = "" text_content = ""
tool_calls = [] tool_calls = []

View file

@ -1,4 +1,4 @@
from typing import Optional, Union, Any from typing import Optional, Union, Any, Literal
import types, requests import types, requests
from .base import BaseLLM from .base import BaseLLM
from litellm.utils import ( from litellm.utils import (
@ -12,7 +12,7 @@ from litellm.utils import (
from typing import Callable, Optional, BinaryIO from typing import Callable, Optional, BinaryIO
from litellm import OpenAIConfig from litellm import OpenAIConfig
import litellm, json import litellm, json
import httpx import httpx # type: ignore
from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport from .custom_httpx.azure_dall_e_2 import CustomHTTPTransport, AsyncCustomHTTPTransport
from openai import AzureOpenAI, AsyncAzureOpenAI from openai import AzureOpenAI, AsyncAzureOpenAI
import uuid import uuid
@ -952,6 +952,81 @@ class AzureChatCompletion(BaseLLM):
) )
raise e raise e
def get_headers(
self,
model: Optional[str],
api_key: str,
api_base: str,
api_version: str,
timeout: float,
mode: str,
messages: Optional[list] = None,
input: Optional[list] = None,
prompt: Optional[str] = None,
) -> dict:
client_session = litellm.client_session or httpx.Client(
transport=CustomHTTPTransport(), # handle dall-e-2 calls
)
if "gateway.ai.cloudflare.com" in api_base:
## build base url - assume api base includes resource name
if not api_base.endswith("/"):
api_base += "/"
api_base += f"{model}"
client = AzureOpenAI(
base_url=api_base,
api_version=api_version,
api_key=api_key,
timeout=timeout,
http_client=client_session,
)
model = None
# cloudflare ai gateway, needs model=None
else:
client = AzureOpenAI(
api_version=api_version,
azure_endpoint=api_base,
api_key=api_key,
timeout=timeout,
http_client=client_session,
)
# only run this check if it's not cloudflare ai gateway
if model is None and mode != "image_generation":
raise Exception("model is not set")
completion = None
if messages is None:
messages = [{"role": "user", "content": "Hey"}]
try:
completion = client.chat.completions.with_raw_response.create(
model=model, # type: ignore
messages=messages, # type: ignore
)
except Exception as e:
raise e
response = {}
if completion is None or not hasattr(completion, "headers"):
raise Exception("invalid completion response")
if (
completion.headers.get("x-ratelimit-remaining-requests", None) is not None
): # not provided for dall-e requests
response["x-ratelimit-remaining-requests"] = completion.headers[
"x-ratelimit-remaining-requests"
]
if completion.headers.get("x-ratelimit-remaining-tokens", None) is not None:
response["x-ratelimit-remaining-tokens"] = completion.headers[
"x-ratelimit-remaining-tokens"
]
if completion.headers.get("x-ms-region", None) is not None:
response["x-ms-region"] = completion.headers["x-ms-region"]
return response
async def ahealth_check( async def ahealth_check(
self, self,
model: Optional[str], model: Optional[str],
@ -963,7 +1038,7 @@ class AzureChatCompletion(BaseLLM):
messages: Optional[list] = None, messages: Optional[list] = None,
input: Optional[list] = None, input: Optional[list] = None,
prompt: Optional[str] = None, prompt: Optional[str] = None,
): ) -> dict:
client_session = litellm.aclient_session or httpx.AsyncClient( client_session = litellm.aclient_session or httpx.AsyncClient(
transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls transport=AsyncCustomHTTPTransport(), # handle dall-e-2 calls
) )
@ -1040,4 +1115,8 @@ class AzureChatCompletion(BaseLLM):
response["x-ratelimit-remaining-tokens"] = completion.headers[ response["x-ratelimit-remaining-tokens"] = completion.headers[
"x-ratelimit-remaining-tokens" "x-ratelimit-remaining-tokens"
] ]
if completion.headers.get("x-ms-region", None) is not None:
response["x-ms-region"] = completion.headers["x-ms-region"]
return response return response
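Both `get_headers` and the updated `ahealth_check` read rate-limit and region information off the raw Azure response. A minimal sketch of that pattern using the `openai` SDK's `with_raw_response` wrapper; the endpoint, deployment name, and API version below are placeholders.

```
# Sketch: reading rate-limit / region headers from an Azure OpenAI response.
import os
from openai import AzureOpenAI

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_API_BASE"],  # e.g. https://my-resource.openai.azure.com
    api_key=os.environ["AZURE_API_KEY"],
    api_version="2024-02-01",                     # placeholder API version
)

raw = client.chat.completions.with_raw_response.create(
    model="my-gpt-35-deployment",                 # placeholder deployment name
    messages=[{"role": "user", "content": "Hey"}],
)

headers = {}
for name in ("x-ratelimit-remaining-requests", "x-ratelimit-remaining-tokens", "x-ms-region"):
    if raw.headers.get(name) is not None:         # not every header is returned for every call
        headers[name] = raw.headers[name]

print(headers)
chat_completion = raw.parse()                     # the usual ChatCompletion object, if needed
```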

View file

@ -1,5 +1,5 @@
from typing import Optional, Union, Any from typing import Optional, Union, Any
import types, requests import types, requests # type: ignore
from .base import BaseLLM from .base import BaseLLM
from litellm.utils import ( from litellm.utils import (
ModelResponse, ModelResponse,

View file

@ -1,7 +1,7 @@
import os import os
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable from typing import Callable
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage

View file

@ -163,10 +163,9 @@ class AmazonAnthropicClaude3Config:
"stop", "stop",
"temperature", "temperature",
"top_p", "top_p",
"extra_headers" "extra_headers",
] ]
def map_openai_params(self, non_default_params: dict, optional_params: dict): def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items(): for param, value in non_default_params.items():
if param == "max_tokens": if param == "max_tokens":
@ -534,10 +533,12 @@ class AmazonStabilityConfig:
def add_custom_header(headers): def add_custom_header(headers):
"""Closure to capture the headers and add them.""" """Closure to capture the headers and add them."""
def callback(request, **kwargs): def callback(request, **kwargs):
"""Actual callback function that Boto3 will call.""" """Actual callback function that Boto3 will call."""
for header_name, header_value in headers.items(): for header_name, header_value in headers.items():
request.headers.add_header(header_name, header_value) request.headers.add_header(header_name, header_value)
return callback return callback
@ -672,7 +673,9 @@ def init_bedrock_client(
config=config, config=config,
) )
if extra_headers: if extra_headers:
client.meta.events.register('before-sign.bedrock-runtime.*', add_custom_header(extra_headers)) client.meta.events.register(
"before-sign.bedrock-runtime.*", add_custom_header(extra_headers)
)
return client return client
@ -1224,7 +1227,7 @@ def _embedding_func_single(
"input_type", "search_document" "input_type", "search_document"
) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3 ) # aws bedrock example default - https://us-east-1.console.aws.amazon.com/bedrock/home?region=us-east-1#/providers?model=cohere.embed-english-v3
data = {"texts": [input], **inference_params} # type: ignore data = {"texts": [input], **inference_params} # type: ignore
body = json.dumps(data).encode("utf-8") body = json.dumps(data).encode("utf-8") # type: ignore
## LOGGING ## LOGGING
request_str = f""" request_str = f"""
response = client.invoke_model( response = client.invoke_model(
@ -1416,7 +1419,7 @@ def image_generation(
## LOGGING ## LOGGING
request_str = f""" request_str = f"""
response = client.invoke_model( response = client.invoke_model(
body={body}, body={body}, # type: ignore
modelId={modelId}, modelId={modelId},
accept="application/json", accept="application/json",
contentType="application/json", contentType="application/json",
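The `extra_headers` support works by registering a closure on botocore's event system so the headers are attached just before each request is signed. The same mechanism in isolation (region and header values are placeholders):

```
# Sketch: injecting custom headers into Bedrock runtime calls via botocore events.
import boto3

def add_custom_header(headers):
    """Closure that captures the headers to add."""
    def callback(request, **kwargs):
        for header_name, header_value in headers.items():
            request.headers.add_header(header_name, header_value)
    return callback

client = boto3.client("bedrock-runtime", region_name="us-east-1")  # placeholder region

extra_headers = {"x-my-proxy-tag": "team-a"}  # placeholder header
client.meta.events.register(
    "before-sign.bedrock-runtime.*", add_custom_header(extra_headers)
)
# Subsequent client.invoke_model(...) calls will carry the extra header.
```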

View file

@ -1,11 +1,11 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm
import httpx import httpx # type: ignore
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, traceback import time, traceback
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm import litellm
import httpx import httpx # type: ignore
class CohereError(Exception): class CohereError(Exception):

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, traceback import time, traceback
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage
import litellm import litellm
import httpx import httpx # type: ignore
from .prompt_templates.factory import cohere_message_pt from .prompt_templates.factory import cohere_message_pt

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, traceback import time, traceback
from typing import Callable, Optional, List from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Choices, Message, Usage from litellm.utils import ModelResponse, Choices, Message, Usage

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm

View file

@ -1,10 +1,10 @@
from itertools import chain from itertools import chain
import requests, types, time import requests, types, time # type: ignore
import json, uuid import json, uuid
import traceback import traceback
from typing import Optional from typing import Optional
import litellm import litellm
import httpx, aiohttp, asyncio import httpx, aiohttp, asyncio # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
@ -220,7 +220,10 @@ def get_ollama_response(
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -232,7 +235,9 @@ def get_ollama_response(
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model model_response["model"] = "ollama/" + model
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt, disallowed_special=()))) # type: ignore
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", ""))) completion_tokens = response_json.get(
"eval_count", len(response_json.get("message", dict()).get("content", ""))
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
@ -273,7 +278,10 @@ def ollama_completion_stream(url, data, logging_obj):
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -316,7 +324,8 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
[ [
chunk.choices[0].delta.content chunk.choices[0].delta.content
async for chunk in streamwrapper async for chunk in streamwrapper
if chunk.choices[0].delta.content] if chunk.choices[0].delta.content
]
) )
function_call = json.loads(response_content) function_call = json.loads(response_content)
delta = litellm.utils.Delta( delta = litellm.utils.Delta(
@ -324,7 +333,10 @@ async def ollama_async_streaming(url, data, model_response, encoding, logging_ob
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -373,7 +385,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
tool_calls=[ tool_calls=[
{ {
"id": f"call_{str(uuid.uuid4())}", "id": f"call_{str(uuid.uuid4())}",
"function": {"name": function_call["name"], "arguments": json.dumps(function_call["arguments"])}, "function": {
"name": function_call["name"],
"arguments": json.dumps(function_call["arguments"]),
},
"type": "function", "type": "function",
} }
], ],
@ -387,7 +402,10 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
model_response["created"] = int(time.time()) model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"] model_response["model"] = "ollama/" + data["model"]
prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"], disallowed_special=()))) # type: ignore
completion_tokens = response_json.get("eval_count", len(response_json.get("message",dict()).get("content", ""))) completion_tokens = response_json.get(
"eval_count",
len(response_json.get("message", dict()).get("content", "")),
)
model_response["usage"] = litellm.Usage( model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens, prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens, completion_tokens=completion_tokens,
@ -474,3 +492,25 @@ async def ollama_aembeddings(
"total_tokens": total_input_tokens, "total_tokens": total_input_tokens,
} }
return model_response return model_response
def ollama_embeddings(
api_base: str,
model: str,
prompts: list,
optional_params=None,
logging_obj=None,
model_response=None,
encoding=None,
):
return asyncio.run(
ollama_aembeddings(
api_base,
model,
prompts,
optional_params,
logging_obj,
model_response,
encoding,
)
)
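The new `ollama_embeddings` is a synchronous shim over the async implementation via `asyncio.run`. The general pattern in isolation (the async function below is a stand-in, not LiteLLM's):

```
# General pattern: expose a sync entry point for an async implementation.
import asyncio

async def aembed(texts):
    # Stand-in for an async HTTP call to an embeddings endpoint.
    await asyncio.sleep(0)  # pretend network I/O
    return [[float(len(t))] for t in texts]

def embed(texts):
    # asyncio.run spins up and tears down an event loop around the coroutine.
    # Caveat: it raises if called from code already running inside an event loop.
    return asyncio.run(aembed(texts))

print(embed(["hello", "world!"]))  # [[5.0], [6.0]]
```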

View file

@ -1,7 +1,7 @@
import os import os
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage

View file

@ -22,7 +22,6 @@ from litellm.utils import (
TextCompletionResponse, TextCompletionResponse,
) )
from typing import Callable, Optional from typing import Callable, Optional
import aiohttp, requests
import litellm import litellm
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
from openai import OpenAI, AsyncOpenAI from openai import OpenAI, AsyncOpenAI
@ -531,6 +530,7 @@ class OpenAIChatCompletion(BaseLLM):
model=model, model=model,
custom_llm_provider="openai", custom_llm_provider="openai",
logging_obj=logging_obj, logging_obj=logging_obj,
stream_options=data.get("stream_options", None),
) )
return streamwrapper return streamwrapper
@ -580,6 +580,7 @@ class OpenAIChatCompletion(BaseLLM):
model=model, model=model,
custom_llm_provider="openai", custom_llm_provider="openai",
logging_obj=logging_obj, logging_obj=logging_obj,
stream_options=data.get("stream_options", None),
) )
return streamwrapper return streamwrapper
except ( except (

View file

@ -1,7 +1,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm

View file

@ -981,7 +981,7 @@ def anthropic_messages_pt(messages: list):
# add role=tool support to allow function call result/error submission # add role=tool support to allow function call result/error submission
user_message_types = {"user", "tool", "function"} user_message_types = {"user", "tool", "function"}
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them. # reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
new_messages = [] new_messages: list = []
msg_i = 0 msg_i = 0
tool_use_param = False tool_use_param = False
while msg_i < len(messages): while msg_i < len(messages):

View file

@ -1,11 +1,11 @@
import os, types import os, types
import json import json
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
import litellm import litellm
import httpx import httpx # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -1,14 +1,14 @@
import os, types, traceback import os, types, traceback
from enum import Enum from enum import Enum
import json import json
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional, Any from typing import Callable, Optional, Any
import litellm import litellm
from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage from litellm.utils import ModelResponse, EmbeddingResponse, get_secret, Usage
import sys import sys
from copy import deepcopy from copy import deepcopy
import httpx import httpx # type: ignore
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt
@ -295,7 +295,7 @@ def completion(
EndpointName={model}, EndpointName={model},
InferenceComponentName={model_id}, InferenceComponentName={model_id},
ContentType="application/json", ContentType="application/json",
Body={data}, Body={data}, # type: ignore
CustomAttributes="accept_eula=true", CustomAttributes="accept_eula=true",
) )
""" # type: ignore """ # type: ignore
@ -321,7 +321,7 @@ def completion(
response = client.invoke_endpoint( response = client.invoke_endpoint(
EndpointName={model}, EndpointName={model},
ContentType="application/json", ContentType="application/json",
Body={data}, Body={data}, # type: ignore
CustomAttributes="accept_eula=true", CustomAttributes="accept_eula=true",
) )
""" # type: ignore """ # type: ignore
@ -688,7 +688,7 @@ def embedding(
response = client.invoke_endpoint( response = client.invoke_endpoint(
EndpointName={model}, EndpointName={model},
ContentType="application/json", ContentType="application/json",
Body={data}, Body={data}, # type: ignore
CustomAttributes="accept_eula=true", CustomAttributes="accept_eula=true",
)""" # type: ignore )""" # type: ignore
logging_obj.pre_call( logging_obj.pre_call(

View file

@ -6,11 +6,11 @@ Reference: https://docs.together.ai/docs/openai-api-compatibility
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional from typing import Callable, Optional
import litellm import litellm
import httpx import httpx # type: ignore
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -1,12 +1,12 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time import time
from typing import Callable, Optional, Union, List from typing import Callable, Optional, Union, List
from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason from litellm.utils import ModelResponse, Usage, CustomStreamWrapper, map_finish_reason
import litellm, uuid import litellm, uuid
import httpx, inspect import httpx, inspect # type: ignore
class VertexAIError(Exception): class VertexAIError(Exception):

View file

@ -3,7 +3,7 @@
import os, types import os, types
import json import json
from enum import Enum from enum import Enum
import requests, copy import requests, copy # type: ignore
import time, uuid import time, uuid
from typing import Callable, Optional, List from typing import Callable, Optional, List
from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper from litellm.utils import ModelResponse, Usage, map_finish_reason, CustomStreamWrapper
@ -17,7 +17,7 @@ from .prompt_templates.factory import (
extract_between_tags, extract_between_tags,
parse_xml_params, parse_xml_params,
) )
import httpx import httpx # type: ignore
class VertexAIError(Exception): class VertexAIError(Exception):

View file

@ -1,8 +1,8 @@
import os import os
import json import json
from enum import Enum from enum import Enum
import requests import requests # type: ignore
import time, httpx import time, httpx # type: ignore
from typing import Callable, Any from typing import Callable, Any
from litellm.utils import ModelResponse, Usage from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import prompt_factory, custom_prompt from .prompt_templates.factory import prompt_factory, custom_prompt

View file

@ -3,8 +3,8 @@ import json, types, time # noqa: E401
from contextlib import contextmanager from contextlib import contextmanager
from typing import Callable, Dict, Optional, Any, Union, List from typing import Callable, Dict, Optional, Any, Union, List
import httpx import httpx # type: ignore
import requests import requests # type: ignore
import litellm import litellm
from litellm.utils import ModelResponse, get_secret, Usage from litellm.utils import ModelResponse, get_secret, Usage

View file

@ -12,9 +12,9 @@ from typing import Any, Literal, Union, BinaryIO
from functools import partial from functools import partial
import dotenv, traceback, random, asyncio, time, contextvars import dotenv, traceback, random, asyncio, time, contextvars
from copy import deepcopy from copy import deepcopy
import httpx import httpx
import litellm import litellm
from ._logging import verbose_logger from ._logging import verbose_logger
from litellm import ( # type: ignore from litellm import ( # type: ignore
client, client,
@ -188,6 +188,7 @@ async def acompletion(
top_p: Optional[float] = None, top_p: Optional[float] = None,
n: Optional[int] = None, n: Optional[int] = None,
stream: Optional[bool] = None, stream: Optional[bool] = None,
stream_options: Optional[dict] = None,
stop=None, stop=None,
max_tokens: Optional[int] = None, max_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None, presence_penalty: Optional[float] = None,
@ -207,6 +208,7 @@ async def acompletion(
api_version: Optional[str] = None, api_version: Optional[str] = None,
api_key: Optional[str] = None, api_key: Optional[str] = None,
model_list: Optional[list] = None, # pass in a list of api_base,keys, etc. model_list: Optional[list] = None, # pass in a list of api_base,keys, etc.
extra_headers: Optional[dict] = None,
# Optional liteLLM function params # Optional liteLLM function params
**kwargs, **kwargs,
): ):
@ -224,6 +226,7 @@ async def acompletion(
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
n (int, optional): The number of completions to generate (default is 1). n (int, optional): The number of completions to generate (default is 1).
stream (bool, optional): If True, return a streaming response (default is False). stream (bool, optional): If True, return a streaming response (default is False).
stream_options (dict, optional): A dictionary containing options for the streaming response. Only use this if stream is True.
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
@ -261,6 +264,7 @@ async def acompletion(
"top_p": top_p, "top_p": top_p,
"n": n, "n": n,
"stream": stream, "stream": stream,
"stream_options": stream_options,
"stop": stop, "stop": stop,
"max_tokens": max_tokens, "max_tokens": max_tokens,
"presence_penalty": presence_penalty, "presence_penalty": presence_penalty,
@ -305,6 +309,7 @@ async def acompletion(
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface" or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
@ -457,6 +462,7 @@ def completion(
top_p: Optional[float] = None, top_p: Optional[float] = None,
n: Optional[int] = None, n: Optional[int] = None,
stream: Optional[bool] = None, stream: Optional[bool] = None,
stream_options: Optional[dict] = None,
stop=None, stop=None,
max_tokens: Optional[int] = None, max_tokens: Optional[int] = None,
presence_penalty: Optional[float] = None, presence_penalty: Optional[float] = None,
@ -496,6 +502,7 @@ def completion(
top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0). top_p (float, optional): The top-p parameter for nucleus sampling (default is 1.0).
n (int, optional): The number of completions to generate (default is 1). n (int, optional): The number of completions to generate (default is 1).
stream (bool, optional): If True, return a streaming response (default is False). stream (bool, optional): If True, return a streaming response (default is False).
stream_options (dict, optional): A dictionary containing options for the streaming response. Only set this when you set stream: true.
stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens. stop(string/list, optional): - Up to 4 sequences where the LLM API will stop generating further tokens.
max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity). max_tokens (integer, optional): The maximum number of tokens in the generated completion (default is infinity).
presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far. presence_penalty (float, optional): It is used to penalize new tokens based on their existence in the text so far.
@ -573,6 +580,7 @@ def completion(
"top_p", "top_p",
"n", "n",
"stream", "stream",
"stream_options",
"stop", "stop",
"max_tokens", "max_tokens",
"presence_penalty", "presence_penalty",
@ -648,6 +656,8 @@ def completion(
"base_model", "base_model",
"stream_timeout", "stream_timeout",
"supports_system_message", "supports_system_message",
"region_name",
"allowed_model_region",
] ]
default_params = openai_params + litellm_params default_params = openai_params + litellm_params
non_default_params = { non_default_params = {
@ -783,6 +793,7 @@ def completion(
top_p=top_p, top_p=top_p,
n=n, n=n,
stream=stream, stream=stream,
stream_options=stream_options,
stop=stop, stop=stop,
max_tokens=max_tokens, max_tokens=max_tokens,
presence_penalty=presence_penalty, presence_penalty=presence_penalty,
@ -982,6 +993,7 @@ def completion(
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "anyscale" or custom_llm_provider == "anyscale"
or custom_llm_provider == "mistral" or custom_llm_provider == "mistral"
or custom_llm_provider == "openai" or custom_llm_provider == "openai"
@ -2565,6 +2577,7 @@ async def aembedding(*args, **kwargs):
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "fireworks_ai" or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai"
@ -2714,6 +2727,8 @@ def embedding(
"ttl", "ttl",
"cache", "cache",
"no-log", "no-log",
"region_name",
"allowed_model_region",
] ]
default_params = openai_params + litellm_params default_params = openai_params + litellm_params
non_default_params = { non_default_params = {
@ -2947,8 +2962,10 @@ def embedding(
model=model, # type: ignore model=model, # type: ignore
llm_provider="ollama", # type: ignore llm_provider="ollama", # type: ignore
) )
if aembedding: ollama_embeddings_fn = (
response = ollama.ollama_aembeddings( ollama.ollama_aembeddings if aembedding else ollama.ollama_embeddings
)
response = ollama_embeddings_fn(
api_base=api_base, api_base=api_base,
model=model, model=model,
prompts=input, prompts=input,
@ -3085,11 +3102,13 @@ async def atext_completion(*args, **kwargs):
or custom_llm_provider == "deepinfra" or custom_llm_provider == "deepinfra"
or custom_llm_provider == "perplexity" or custom_llm_provider == "perplexity"
or custom_llm_provider == "groq" or custom_llm_provider == "groq"
or custom_llm_provider == "deepseek"
or custom_llm_provider == "fireworks_ai" or custom_llm_provider == "fireworks_ai"
or custom_llm_provider == "text-completion-openai" or custom_llm_provider == "text-completion-openai"
or custom_llm_provider == "huggingface" or custom_llm_provider == "huggingface"
or custom_llm_provider == "ollama" or custom_llm_provider == "ollama"
or custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai"
or custom_llm_provider in litellm.openai_compatible_providers
): # currently implemented aiohttp calls for just azure and openai, soon all. ): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally # Await normally
response = await loop.run_in_executor(None, func_with_context) response = await loop.run_in_executor(None, func_with_context)
@ -3120,6 +3139,8 @@ async def atext_completion(*args, **kwargs):
## TRANSLATE CHAT TO TEXT FORMAT ## ## TRANSLATE CHAT TO TEXT FORMAT ##
if isinstance(response, TextCompletionResponse): if isinstance(response, TextCompletionResponse):
return response return response
elif asyncio.iscoroutine(response):
response = await response
text_completion_response = TextCompletionResponse() text_completion_response = TextCompletionResponse()
text_completion_response["id"] = response.get("id", None) text_completion_response["id"] = response.get("id", None)
@ -3581,6 +3602,8 @@ def image_generation(
"caching_groups", "caching_groups",
"ttl", "ttl",
"cache", "cache",
"region_name",
"allowed_model_region",
] ]
default_params = openai_params + litellm_params default_params = openai_params + litellm_params
non_default_params = { non_default_params = {
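Among the changes in this file, `completion()` and `acompletion()` now accept a `stream_options` dict that is forwarded to OpenAI-compatible backends when `stream=True`. A hedged usage sketch; `include_usage` is the option OpenAI documents for its streaming API and is assumed to be what callers would pass here:

```
# Sketch: passing stream_options through litellm.completion (needs OPENAI_API_KEY set).
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hi in one word."}],
    stream=True,
    stream_options={"include_usage": True},  # assumed option; only meaningful with stream=True
)

for chunk in response:
    # Most chunks carry content deltas; with include_usage, a final chunk carries usage totals.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")
```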

View file

@ -739,6 +739,24 @@
"litellm_provider": "mistral", "litellm_provider": "mistral",
"mode": "embedding" "mode": "embedding"
}, },
"deepseek-chat": {
"max_tokens": 4096,
"max_input_tokens": 32000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
"litellm_provider": "deepseek",
"mode": "chat"
},
"deepseek-coder": {
"max_tokens": 4096,
"max_input_tokens": 16000,
"max_output_tokens": 4096,
"input_cost_per_token": 0.00000014,
"output_cost_per_token": 0.00000028,
"litellm_provider": "deepseek",
"mode": "chat"
},
"groq/llama2-70b-4096": { "groq/llama2-70b-4096": {
"max_tokens": 4096, "max_tokens": 4096,
"max_input_tokens": 4096, "max_input_tokens": 4096,
@ -1060,8 +1078,8 @@
"max_tokens": 8192, "max_tokens": 8192,
"max_input_tokens": 1000000, "max_input_tokens": 1000000,
"max_output_tokens": 8192, "max_output_tokens": 8192,
"input_cost_per_token": 0, "input_cost_per_token": 0.000000625,
"output_cost_per_token": 0, "output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models", "litellm_provider": "vertex_ai-language-models",
"mode": "chat", "mode": "chat",
"supports_function_calling": true, "supports_function_calling": true,
@ -1072,8 +1090,8 @@
"max_tokens": 8192, "max_tokens": 8192,
"max_input_tokens": 1000000, "max_input_tokens": 1000000,
"max_output_tokens": 8192, "max_output_tokens": 8192,
"input_cost_per_token": 0, "input_cost_per_token": 0.000000625,
"output_cost_per_token": 0, "output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models", "litellm_provider": "vertex_ai-language-models",
"mode": "chat", "mode": "chat",
"supports_function_calling": true, "supports_function_calling": true,
@ -1084,8 +1102,8 @@
"max_tokens": 8192, "max_tokens": 8192,
"max_input_tokens": 1000000, "max_input_tokens": 1000000,
"max_output_tokens": 8192, "max_output_tokens": 8192,
"input_cost_per_token": 0, "input_cost_per_token": 0.000000625,
"output_cost_per_token": 0, "output_cost_per_token": 0.000001875,
"litellm_provider": "vertex_ai-language-models", "litellm_provider": "vertex_ai-language-models",
"mode": "chat", "mode": "chat",
"supports_function_calling": true, "supports_function_calling": true,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1 +1 @@
[minified Next.js webpack runtime chunk for the admin UI; diff collapsed — the regenerated bundle only updates the CSS asset hash from static/css/00c2ddbcd01819c0.css to static/css/a1602eb39f799143.css]

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@ -1 +1 @@
[prerendered LiteLLM Dashboard HTML shell; diff collapsed — only generated asset references change: webpack-202e312607f242a1.js → webpack-5b257e1ab47d4b4a.js, CSS 00c2ddbcd01819c0.css → a1602eb39f799143.css, page chunks 142-11990a208bf93746.js / page-d9bdfedbff191985.js → 566-ccd699ab19124658.js / page-c804e862b63be987.js, buildId e55gTzpa2g2-9SwXgA9Uo → K8KXTbmuI2ArWjjdMi2iq]


@ -1,7 +1,7 @@
[Next.js RSC payload for the dashboard page; diff collapsed — it re-references the same regenerated chunks, CSS hash, and buildId listed above]


@ -4,6 +4,22 @@ model_list:
api_key: my-fake-key api_key: my-fake-key
model: openai/my-fake-model model: openai/my-fake-model
model_name: fake-openai-endpoint model_name: fake-openai-endpoint
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key-2
model: openai/my-fake-model-2
model_name: fake-openai-endpoint
- litellm_params:
api_base: https://openai-function-calling-workers.tasslexyz.workers.dev/
api_key: my-fake-key-3
model: openai/my-fake-model-3
model_name: fake-openai-endpoint
- model_name: gpt-4
litellm_params:
model: gpt-3.5-turbo
- litellm_params:
model: together_ai/codellama/CodeLlama-13b-Instruct-hf
model_name: CodeLlama-13b-Instruct
router_settings: router_settings:
num_retries: 0 num_retries: 0
enable_pre_call_checks: true enable_pre_call_checks: true
@ -15,8 +31,11 @@ router_settings:
routing_strategy: "latency-based-routing" routing_strategy: "latency-based-routing"
litellm_settings: litellm_settings:
success_callback: ["openmeter"] success_callback: ["langfuse"]
general_settings: general_settings:
alerting: ["slack"] alerting: ["slack"]
alert_types: ["llm_exceptions"] alert_types: ["llm_exceptions", "daily_reports"]
alerting_args:
daily_report_frequency: 60 # every minute
report_check_interval: 5 # every 5s
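
For anyone using the SDK instead of the proxy, a rough Python equivalent of the router and litellm settings in this config (model list trimmed, keys are placeholders; the Slack alerting block has no direct SDK equivalent here):

```python
# rough SDK-side equivalent of the proxy config above
import litellm
from litellm import Router

litellm.success_callback = ["langfuse"]  # litellm_settings.success_callback

router = Router(
    model_list=[
        {
            "model_name": "fake-openai-endpoint",
            "litellm_params": {
                "model": "openai/my-fake-model",
                "api_key": "my-fake-key",
                "api_base": "https://openai-function-calling-workers.tasslexyz.workers.dev/",
            },
        },
    ],
    num_retries=0,                            # router_settings.num_retries
    enable_pre_call_checks=True,              # router_settings.enable_pre_call_checks
    routing_strategy="latency-based-routing",
)
```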


@ -458,6 +458,27 @@ class UpdateUserRequest(GenerateRequestBase):
return values return values
class NewEndUserRequest(LiteLLMBase):
user_id: str
alias: Optional[str] = None # human-friendly alias
blocked: bool = False # allow/disallow requests for this end-user
max_budget: Optional[float] = None
budget_id: Optional[str] = None # give either a budget_id or max_budget
allowed_model_region: Optional[Literal["eu"]] = (
None # require all user requests to use models in this specific region
)
default_model: Optional[str] = (
None # if no equivalent model in allowed region - default all requests to this model
)
@root_validator(pre=True)
def check_user_info(cls, values):
if values.get("max_budget") is not None and values.get("budget_id") is not None:
raise ValueError("Set either 'max_budget' or 'budget_id', not both.")
return values
class Member(LiteLLMBase): class Member(LiteLLMBase):
role: Literal["admin", "user"] role: Literal["admin", "user"]
user_id: Optional[str] = None user_id: Optional[str] = None
@ -494,6 +515,8 @@ class NewTeamRequest(TeamBase):
class GlobalEndUsersSpend(LiteLLMBase): class GlobalEndUsersSpend(LiteLLMBase):
api_key: Optional[str] = None api_key: Optional[str] = None
startTime: Optional[datetime] = None
endTime: Optional[datetime] = None
class TeamMemberAddRequest(LiteLLMBase): class TeamMemberAddRequest(LiteLLMBase):
@ -836,6 +859,7 @@ class UserAPIKeyAuth(
api_key: Optional[str] = None api_key: Optional[str] = None
user_role: Optional[Literal["proxy_admin", "app_owner", "app_user"]] = None user_role: Optional[Literal["proxy_admin", "app_owner", "app_user"]] = None
allowed_model_region: Optional[Literal["eu"]] = None
@root_validator(pre=True) @root_validator(pre=True)
def check_api_key(cls, values): def check_api_key(cls, values):
@ -881,6 +905,8 @@ class LiteLLM_EndUserTable(LiteLLMBase):
blocked: bool blocked: bool
alias: Optional[str] = None alias: Optional[str] = None
spend: float = 0.0 spend: float = 0.0
allowed_model_region: Optional[Literal["eu"]] = None
default_model: Optional[str] = None
litellm_budget_table: Optional[LiteLLM_BudgetTable] = None litellm_budget_table: Optional[LiteLLM_BudgetTable] = None
@root_validator(pre=True) @root_validator(pre=True)
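
A small sketch of the `NewEndUserRequest` validation added above (the import path assumes the upstream `litellm.proxy._types` module; ids and model names are placeholders):

```python
from litellm.proxy._types import NewEndUserRequest

# valid: budget comes from a budget_id, requests pinned to EU-hosted models
end_user = NewEndUserRequest(
    user_id="customer-123",
    alias="Acme Corp",
    budget_id="budget-eu-default",
    allowed_model_region="eu",
    default_model="azure/gpt-35-turbo-eu",
)

# invalid: the root validator rejects setting both budget fields at once
try:
    NewEndUserRequest(user_id="customer-456", max_budget=10.0, budget_id="budget-1")
except ValueError as err:  # pydantic surfaces the validator message here
    print(err)
```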


@ -206,9 +206,9 @@ async def get_end_user_object(
if end_user_id is None: if end_user_id is None:
return None return None
_key = "end_user_id:{}".format(end_user_id)
# check if in cache # check if in cache
cached_user_obj = user_api_key_cache.async_get_cache(key=end_user_id) cached_user_obj = await user_api_key_cache.async_get_cache(key=_key)
if cached_user_obj is not None: if cached_user_obj is not None:
if isinstance(cached_user_obj, dict): if isinstance(cached_user_obj, dict):
return LiteLLM_EndUserTable(**cached_user_obj) return LiteLLM_EndUserTable(**cached_user_obj)
@ -223,7 +223,14 @@ async def get_end_user_object(
if response is None: if response is None:
raise Exception raise Exception
return LiteLLM_EndUserTable(**response.dict()) # save the end-user object to cache
await user_api_key_cache.async_set_cache(
key="end_user_id:{}".format(end_user_id), value=response
)
_response = LiteLLM_EndUserTable(**response.dict())
return _response
except Exception as e: # if end-user not in db except Exception as e: # if end-user not in db
return None return None
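
The change above namespaces the cache key as `end_user_id:<id>`, awaits the async cache read (previously the coroutine was never awaited), and writes the DB result back to the cache. A standalone sketch of that read-through pattern with litellm's `DualCache`; the DB lookup is stubbed out:

```python
import asyncio
from litellm.caching import DualCache

user_api_key_cache = DualCache()

async def get_end_user(end_user_id: str):
    _key = "end_user_id:{}".format(end_user_id)
    cached = await user_api_key_cache.async_get_cache(key=_key)
    if cached is not None:
        return cached
    db_row = {"user_id": end_user_id, "blocked": False}  # stand-in for the Prisma query
    await user_api_key_cache.async_set_cache(key=_key, value=db_row)
    return db_row

asyncio.run(get_end_user("customer-123"))
```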


@ -15,6 +15,9 @@ from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
from litellm.proxy.utils import PrismaClient from litellm.proxy.utils import PrismaClient
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from typing import Optional from typing import Optional
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization
class JWTHandler: class JWTHandler:
@ -142,8 +145,8 @@ class JWTHandler:
public_key = keys[0] public_key = keys[0]
elif len(keys) > 1: elif len(keys) > 1:
for key in keys: for key in keys:
if kid is not None and key["kid"] == kid: if kid is not None and key == kid:
public_key = key public_key = keys[key]
if public_key is None: if public_key is None:
raise Exception( raise Exception(
@ -153,6 +156,11 @@ class JWTHandler:
return public_key return public_key
async def auth_jwt(self, token: str) -> dict: async def auth_jwt(self, token: str) -> dict:
audience = os.getenv("JWT_AUDIENCE")
decode_options = None
if audience is None:
decode_options = {"verify_aud": False}
from jwt.algorithms import RSAAlgorithm from jwt.algorithms import RSAAlgorithm
header = jwt.get_unverified_header(token) header = jwt.get_unverified_header(token)
@ -182,7 +190,33 @@ class JWTHandler:
token, token,
public_key_rsa, # type: ignore public_key_rsa, # type: ignore
algorithms=["RS256"], algorithms=["RS256"],
options={"verify_aud": False}, options=decode_options,
audience=audience,
)
return payload
except jwt.ExpiredSignatureError:
# the token is expired, do something to refresh it
raise Exception("Token Expired")
except Exception as e:
raise Exception(f"Validation fails: {str(e)}")
elif public_key is not None and isinstance(public_key, str):
try:
cert = x509.load_pem_x509_certificate(public_key.encode(), default_backend())
# Extract public key
key = cert.public_key().public_bytes(
serialization.Encoding.PEM,
serialization.PublicFormat.SubjectPublicKeyInfo
)
# decode the token using the public key
payload = jwt.decode(
token,
key,
algorithms=["RS256"],
audience=audience,
options=decode_options
) )
return payload return payload
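
A standalone sketch of the decode paths added above: audience verification is skipped only when `JWT_AUDIENCE` is unset, and a PEM certificate string is converted to a public key before decoding (PyJWT + cryptography, mirroring the imports in this diff):

```python
import os
import jwt
from cryptography import x509
from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import serialization

def decode_with_pem_cert(token: str, pem_cert: str) -> dict:
    audience = os.getenv("JWT_AUDIENCE")
    decode_options = {"verify_aud": False} if audience is None else None

    # extract the RSA public key from the PEM-encoded certificate
    cert = x509.load_pem_x509_certificate(pem_cert.encode(), default_backend())
    public_key = cert.public_key().public_bytes(
        serialization.Encoding.PEM,
        serialization.PublicFormat.SubjectPublicKeyInfo,
    )
    return jwt.decode(
        token,
        public_key,
        algorithms=["RS256"],
        audience=audience,
        options=decode_options,
    )
```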


@ -252,7 +252,7 @@ def run_server(
if model and "ollama" in model and api_base is None: if model and "ollama" in model and api_base is None:
run_ollama_serve() run_ollama_serve()
if test_async is True: if test_async is True:
import requests, concurrent, time import requests, concurrent, time # type: ignore
api_base = f"http://{host}:{port}" api_base = f"http://{host}:{port}"
@ -418,7 +418,7 @@ def run_server(
read from there and save it to os.env['DATABASE_URL'] read from there and save it to os.env['DATABASE_URL']
""" """
try: try:
import yaml, asyncio import yaml, asyncio # type: ignore
except: except:
raise ImportError( raise ImportError(
"yaml needs to be imported. Run - `pip install 'litellm[proxy]'`" "yaml needs to be imported. Run - `pip install 'litellm[proxy]'`"

File diff suppressed because it is too large


@ -150,6 +150,8 @@ model LiteLLM_EndUserTable {
user_id String @id user_id String @id
alias String? // admin-facing alias alias String? // admin-facing alias
spend Float @default(0.0) spend Float @default(0.0)
allowed_model_region String? // require all user requests to use models in this specific region
default_model String? // use along with 'allowed_model_region'. if no available model in region, default to this model.
budget_id String? budget_id String?
litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id]) litellm_budget_table LiteLLM_BudgetTable? @relation(fields: [budget_id], references: [budget_id])
blocked Boolean @default(false) blocked Boolean @default(false)


@ -73,6 +73,7 @@ class ProxyLogging:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
] = [ ] = [
"llm_exceptions", "llm_exceptions",
@ -80,11 +81,13 @@ class ProxyLogging:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
self.slack_alerting_instance = SlackAlerting( self.slack_alerting_instance = SlackAlerting(
alerting_threshold=self.alerting_threshold, alerting_threshold=self.alerting_threshold,
alerting=self.alerting, alerting=self.alerting,
alert_types=self.alert_types, alert_types=self.alert_types,
internal_usage_cache=self.internal_usage_cache,
) )
def update_values( def update_values(
@ -100,9 +103,11 @@ class ProxyLogging:
"llm_requests_hanging", "llm_requests_hanging",
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
"daily_reports",
] ]
] ]
] = None, ] = None,
alerting_args: Optional[dict] = None,
): ):
self.alerting = alerting self.alerting = alerting
if alerting_threshold is not None: if alerting_threshold is not None:
@ -114,8 +119,12 @@ class ProxyLogging:
alerting=self.alerting, alerting=self.alerting,
alerting_threshold=self.alerting_threshold, alerting_threshold=self.alerting_threshold,
alert_types=self.alert_types, alert_types=self.alert_types,
alerting_args=alerting_args,
) )
if "daily_reports" in self.alert_types:
litellm.callbacks.append(self.slack_alerting_instance) # type: ignore
if redis_cache is not None: if redis_cache is not None:
self.internal_usage_cache.redis_cache = redis_cache self.internal_usage_cache.redis_cache = redis_cache
@ -293,6 +302,7 @@ class ProxyLogging:
"budget_alerts", "budget_alerts",
"db_exceptions", "db_exceptions",
], ],
request_data: Optional[dict] = None,
): ):
""" """
Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298 Alerting based on thresholds: - https://github.com/BerriAI/litellm/issues/1298
@ -322,10 +332,19 @@ class ProxyLogging:
if _proxy_base_url is not None: if _proxy_base_url is not None:
formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`" formatted_message += f"\n\nProxy URL: `{_proxy_base_url}`"
extra_kwargs = {}
if request_data is not None:
_url = self.slack_alerting_instance._add_langfuse_trace_id_to_alert(
request_data=request_data
)
if _url is not None:
extra_kwargs["🪢 Langfuse Trace"] = _url
formatted_message += "\n\n🪢 Langfuse Trace: {}".format(_url)
for client in self.alerting: for client in self.alerting:
if client == "slack": if client == "slack":
await self.slack_alerting_instance.send_alert( await self.slack_alerting_instance.send_alert(
message=message, level=level, alert_type=alert_type message=message, level=level, alert_type=alert_type, **extra_kwargs
) )
elif client == "sentry": elif client == "sentry":
if litellm.utils.sentry_sdk_instance is not None: if litellm.utils.sentry_sdk_instance is not None:
@ -360,6 +379,7 @@ class ProxyLogging:
message=f"DB read/write call failed: {error_message}", message=f"DB read/write call failed: {error_message}",
level="High", level="High",
alert_type="db_exceptions", alert_type="db_exceptions",
request_data={},
) )
) )
@ -375,7 +395,10 @@ class ProxyLogging:
litellm.utils.capture_exception(error=original_exception) litellm.utils.capture_exception(error=original_exception)
async def post_call_failure_hook( async def post_call_failure_hook(
self, original_exception: Exception, user_api_key_dict: UserAPIKeyAuth self,
original_exception: Exception,
user_api_key_dict: UserAPIKeyAuth,
request_data: dict,
): ):
""" """
Allows users to raise custom exceptions/log when a call fails, without having to deal with parsing Request body. Allows users to raise custom exceptions/log when a call fails, without having to deal with parsing Request body.
@ -400,6 +423,7 @@ class ProxyLogging:
message=f"LLM API call failed: {str(original_exception)}", message=f"LLM API call failed: {str(original_exception)}",
level="High", level="High",
alert_type="llm_exceptions", alert_type="llm_exceptions",
request_data=request_data,
) )
) )
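
Taken together, the `ProxyLogging` changes above thread a new `daily_reports` alert type, per-alert `alerting_args`, and the originating `request_data` (used for Langfuse trace links) through to `SlackAlerting`. A hedged sketch of wiring it up directly, with argument names taken from this diff rather than a published API:

```python
# internal proxy API sketch; not a stable public interface
from litellm.caching import DualCache
from litellm.proxy.utils import ProxyLogging

proxy_logging_obj = ProxyLogging(user_api_key_cache=DualCache())
proxy_logging_obj.update_values(
    alerting=["slack"],
    alerting_threshold=300,
    redis_cache=None,
    alert_types=["llm_exceptions", "daily_reports"],
    alerting_args={"daily_report_frequency": 60, "report_check_interval": 5},
)
```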
@ -502,7 +526,7 @@ class PrismaClient:
finally: finally:
os.chdir(original_dir) os.chdir(original_dir)
# Now you can import the Prisma Client # Now you can import the Prisma Client
from prisma import Prisma # type: ignore from prisma import Prisma
self.db = Prisma() # Client to connect to Prisma db self.db = Prisma() # Client to connect to Prisma db
@ -1665,12 +1689,12 @@ def get_instance_fn(value: str, config_file_path: Optional[str] = None) -> Any:
module_file_path = os.path.join(directory, *module_name.split(".")) module_file_path = os.path.join(directory, *module_name.split("."))
module_file_path += ".py" module_file_path += ".py"
spec = importlib.util.spec_from_file_location(module_name, module_file_path) spec = importlib.util.spec_from_file_location(module_name, module_file_path) # type: ignore
if spec is None: if spec is None:
raise ImportError( raise ImportError(
f"Could not find a module specification for {module_file_path}" f"Could not find a module specification for {module_file_path}"
) )
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec) # type: ignore
spec.loader.exec_module(module) # type: ignore spec.loader.exec_module(module) # type: ignore
else: else:
# Dynamically import the module # Dynamically import the module


@ -21,6 +21,7 @@ from collections import defaultdict
from litellm.router_strategy.least_busy import LeastBusyLoggingHandler from litellm.router_strategy.least_busy import LeastBusyLoggingHandler
from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler from litellm.router_strategy.lowest_tpm_rpm import LowestTPMLoggingHandler
from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler from litellm.router_strategy.lowest_latency import LowestLatencyLoggingHandler
from litellm.router_strategy.lowest_cost import LowestCostLoggingHandler
from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2 from litellm.router_strategy.lowest_tpm_rpm_v2 import LowestTPMLoggingHandler_v2
from litellm.llms.custom_httpx.azure_dall_e_2 import ( from litellm.llms.custom_httpx.azure_dall_e_2 import (
CustomHTTPTransport, CustomHTTPTransport,
@ -31,6 +32,7 @@ from litellm.utils import (
CustomStreamWrapper, CustomStreamWrapper,
get_utc_datetime, get_utc_datetime,
calculate_max_parallel_requests, calculate_max_parallel_requests,
_is_region_eu,
) )
import copy import copy
from litellm._logging import verbose_router_logger from litellm._logging import verbose_router_logger
@ -43,6 +45,7 @@ from litellm.types.router import (
updateDeployment, updateDeployment,
updateLiteLLMParams, updateLiteLLMParams,
RetryPolicy, RetryPolicy,
AlertingConfig,
) )
from litellm.integrations.custom_logger import CustomLogger from litellm.integrations.custom_logger import CustomLogger
@ -98,9 +101,11 @@ class Router:
"least-busy", "least-busy",
"usage-based-routing", "usage-based-routing",
"latency-based-routing", "latency-based-routing",
"cost-based-routing",
] = "simple-shuffle", ] = "simple-shuffle",
routing_strategy_args: dict = {}, # just for latency-based routing routing_strategy_args: dict = {}, # just for latency-based routing
semaphore: Optional[asyncio.Semaphore] = None, semaphore: Optional[asyncio.Semaphore] = None,
alerting_config: Optional[AlertingConfig] = None,
) -> None: ) -> None:
""" """
Initialize the Router class with the given parameters for caching, reliability, and routing strategy. Initialize the Router class with the given parameters for caching, reliability, and routing strategy.
@ -127,9 +132,9 @@ class Router:
retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0. retry_after (int): Minimum time to wait before retrying a failed request. Defaults to 0.
allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None. allowed_fails (Optional[int]): Number of allowed fails before adding to cooldown. Defaults to None.
cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1. cooldown_time (float): Time to cooldown a deployment after failure in seconds. Defaults to 1.
routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing"]): Routing strategy. Defaults to "simple-shuffle". routing_strategy (Literal["simple-shuffle", "least-busy", "usage-based-routing", "latency-based-routing", "cost-based-routing"]): Routing strategy. Defaults to "simple-shuffle".
routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}. routing_strategy_args (dict): Additional args for latency-based routing. Defaults to {}.
alerting_config (AlertingConfig): Slack alerting configuration. Defaults to None.
Returns: Returns:
Router: An instance of the litellm.Router class. Router: An instance of the litellm.Router class.
@ -314,6 +319,9 @@ class Router:
self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = ( self.model_group_retry_policy: Optional[Dict[str, RetryPolicy]] = (
model_group_retry_policy model_group_retry_policy
) )
self.alerting_config: Optional[AlertingConfig] = alerting_config
if self.alerting_config is not None:
self._initialize_alerting()
def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict): def routing_strategy_init(self, routing_strategy: str, routing_strategy_args: dict):
if routing_strategy == "least-busy": if routing_strategy == "least-busy":
@ -347,6 +355,14 @@ class Router:
) )
if isinstance(litellm.callbacks, list): if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestlatency_logger) # type: ignore litellm.callbacks.append(self.lowestlatency_logger) # type: ignore
elif routing_strategy == "cost-based-routing":
self.lowestcost_logger = LowestCostLoggingHandler(
router_cache=self.cache,
model_list=self.model_list,
routing_args={},
)
if isinstance(litellm.callbacks, list):
litellm.callbacks.append(self.lowestcost_logger) # type: ignore
def print_deployment(self, deployment: dict): def print_deployment(self, deployment: dict):
""" """
@ -1847,6 +1863,10 @@ class Router:
self.cache.set_cache( self.cache.set_cache(
value=cached_value, key=cooldown_key, ttl=cooldown_time value=cached_value, key=cooldown_key, ttl=cooldown_time
) )
self.send_deployment_cooldown_alert(
deployment_id=deployment, exception_status=exception_status
)
else: else:
self.failed_calls.set_cache( self.failed_calls.set_cache(
key=deployment, value=updated_fails, ttl=cooldown_time key=deployment, value=updated_fails, ttl=cooldown_time
@ -1980,7 +2000,11 @@ class Router:
# user can pass vars directly or they can pass os.environ/AZURE_API_KEY, in which case we will read the env # we do this here because we init clients for Azure, OpenAI and we need to set the right key
# we do this here because we init clients for Azure, OpenAI and we need to set the right key # we do this here because we init clients for Azure, OpenAI and we need to set the right key
api_key = litellm_params.get("api_key") or default_api_key api_key = litellm_params.get("api_key") or default_api_key
if api_key and api_key.startswith("os.environ/"): if (
api_key
and isinstance(api_key, str)
and api_key.startswith("os.environ/")
):
api_key_env_name = api_key.replace("os.environ/", "") api_key_env_name = api_key.replace("os.environ/", "")
api_key = litellm.get_secret(api_key_env_name) api_key = litellm.get_secret(api_key_env_name)
litellm_params["api_key"] = api_key litellm_params["api_key"] = api_key
@ -2004,6 +2028,7 @@ class Router:
if ( if (
is_azure_ai_studio_model == True is_azure_ai_studio_model == True
and api_base is not None and api_base is not None
and isinstance(api_base, str)
and not api_base.endswith("/v1/") and not api_base.endswith("/v1/")
): ):
# check if it ends with a trailing slash # check if it ends with a trailing slash
@ -2084,13 +2109,14 @@ class Router:
organization = litellm.get_secret(organization_env_name) organization = litellm.get_secret(organization_env_name)
litellm_params["organization"] = organization litellm_params["organization"] = organization
if "azure" in model_name: if "azure" in model_name and isinstance(api_key, str):
if api_base is None: if api_base is None or not isinstance(api_base, str):
raise ValueError( raise ValueError(
f"api_base is required for Azure OpenAI. Set it on your config. Model - {model}" f"api_base is required for Azure OpenAI. Set it on your config. Model - {model}"
) )
if api_version is None: if api_version is None:
api_version = "2023-07-01-preview" api_version = "2023-07-01-preview"
if "gateway.ai.cloudflare.com" in api_base: if "gateway.ai.cloudflare.com" in api_base:
if not api_base.endswith("/"): if not api_base.endswith("/"):
api_base += "/" api_base += "/"
@ -2513,7 +2539,7 @@ class Router:
self.default_deployment = deployment.to_json(exclude_none=True) self.default_deployment = deployment.to_json(exclude_none=True)
# Azure GPT-Vision Enhancements, users can pass os.environ/ # Azure GPT-Vision Enhancements, users can pass os.environ/
data_sources = deployment.litellm_params.get("dataSources", []) data_sources = deployment.litellm_params.get("dataSources", []) or []
for data_source in data_sources: for data_source in data_sources:
params = data_source.get("parameters", {}) params = data_source.get("parameters", {})
@ -2530,6 +2556,22 @@ class Router:
# init OpenAI, Azure clients # init OpenAI, Azure clients
self.set_client(model=deployment.to_json(exclude_none=True)) self.set_client(model=deployment.to_json(exclude_none=True))
# set region (if azure model)
try:
if "azure" in deployment.litellm_params.model:
region = litellm.utils.get_model_region(
litellm_params=deployment.litellm_params, mode=None
)
deployment.litellm_params.region_name = region
except Exception as e:
verbose_router_logger.error(
"Unable to get the region for azure model - {}, {}".format(
deployment.litellm_params.model, str(e)
)
)
pass # [NON-BLOCKING]
return deployment return deployment
def add_deployment(self, deployment: Deployment) -> Optional[Deployment]: def add_deployment(self, deployment: Deployment) -> Optional[Deployment]:
@ -2557,6 +2599,38 @@ class Router:
self.model_names.append(deployment.model_name) self.model_names.append(deployment.model_name)
return deployment return deployment
def upsert_deployment(self, deployment: Deployment) -> Deployment:
"""
Add or update deployment
Parameters:
- deployment: Deployment - the deployment to be added to the Router
Returns:
- The added/updated deployment
"""
# check if deployment already exists
if deployment.model_info.id in self.get_model_ids():
# remove the previous deployment
removal_idx: Optional[int] = None
for idx, model in enumerate(self.model_list):
if model["model_info"]["id"] == deployment.model_info.id:
removal_idx = idx
if removal_idx is not None:
self.model_list.pop(removal_idx)
# add to model list
_deployment = deployment.to_json(exclude_none=True)
self.model_list.append(_deployment)
# initialize client
self._add_deployment(deployment=deployment)
# add to model names
self.model_names.append(deployment.model_name)
return deployment
def delete_deployment(self, id: str) -> Optional[Deployment]: def delete_deployment(self, id: str) -> Optional[Deployment]:
""" """
Parameters: Parameters:
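
A quick sketch of the new `upsert_deployment()` helper. `Deployment`, `LiteLLM_Params`, and `ModelInfo` come from `litellm.types.router` (as used throughout this file); the explicit `model_info.id` below is only there to make the replace-on-second-call behaviour visible, and constructor defaults are assumed rather than re-verified:

```python
from litellm import Router
from litellm.types.router import Deployment, LiteLLM_Params, ModelInfo

router = Router(model_list=[])

deployment = Deployment(
    model_name="gpt-3.5-turbo",
    litellm_params=LiteLLM_Params(model="gpt-3.5-turbo", api_key="sk-..."),
    model_info=ModelInfo(id="my-deployment-1"),
)

router.upsert_deployment(deployment)   # not present yet -> appended
router.upsert_deployment(deployment)   # same model_info.id -> old entry replaced
print(router.get_model_ids())          # ["my-deployment-1"]
```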
@ -2580,11 +2654,21 @@ class Router:
except: except:
return None return None
def get_deployment(self, model_id: str): def get_deployment(self, model_id: str) -> Optional[Deployment]:
"""
Returns -> Deployment or None
Raise Exception -> if model found in invalid format
"""
for model in self.model_list: for model in self.model_list:
if "model_info" in model and "id" in model["model_info"]: if "model_info" in model and "id" in model["model_info"]:
if model_id == model["model_info"]["id"]: if model_id == model["model_info"]["id"]:
if isinstance(model, dict):
return Deployment(**model)
elif isinstance(model, Deployment):
return model return model
else:
raise Exception("Model invalid format - {}".format(type(model)))
return None return None
def get_model_info(self, id: str) -> Optional[dict]: def get_model_info(self, id: str) -> Optional[dict]:
@ -2597,7 +2681,10 @@ class Router:
return model return model
return None return None
def get_model_ids(self): def get_model_ids(self) -> List[str]:
"""
Returns list of model ids.
"""
ids = [] ids = []
for model in self.model_list: for model in self.model_list:
if "model_info" in model and "id" in model["model_info"]: if "model_info" in model and "id" in model["model_info"]:
@ -2605,7 +2692,7 @@ class Router:
ids.append(id) ids.append(id)
return ids return ids
def get_model_names(self): def get_model_names(self) -> List[str]:
return self.model_names return self.model_names
def get_model_list(self): def get_model_list(self):
@ -2631,6 +2718,7 @@ class Router:
"retry_after", "retry_after",
"fallbacks", "fallbacks",
"context_window_fallbacks", "context_window_fallbacks",
"model_group_retry_policy",
] ]
for var in vars_to_include: for var in vars_to_include:
@ -2656,6 +2744,7 @@ class Router:
"retry_after", "retry_after",
"fallbacks", "fallbacks",
"context_window_fallbacks", "context_window_fallbacks",
"model_group_retry_policy",
] ]
_int_settings = [ _int_settings = [
@ -2754,14 +2843,17 @@ class Router:
model: str, model: str,
healthy_deployments: List, healthy_deployments: List,
messages: List[Dict[str, str]], messages: List[Dict[str, str]],
allowed_model_region: Optional[Literal["eu"]] = None,
): ):
""" """
Filter out model in model group, if: Filter out model in model group, if:
- model context window < message length - model context window < message length
- filter models above rpm limits - filter models above rpm limits
- if region given, filter out models not in that region / unknown region
- [TODO] function call and model doesn't support function calling - [TODO] function call and model doesn't support function calling
""" """
verbose_router_logger.debug( verbose_router_logger.debug(
f"Starting Pre-call checks for deployments in model={model}" f"Starting Pre-call checks for deployments in model={model}"
) )
@ -2812,9 +2904,9 @@ class Router:
except Exception as e: except Exception as e:
verbose_router_logger.debug("An error occurs - {}".format(str(e))) verbose_router_logger.debug("An error occurs - {}".format(str(e)))
## RPM CHECK ##
_litellm_params = deployment.get("litellm_params", {}) _litellm_params = deployment.get("litellm_params", {})
model_id = deployment.get("model_info", {}).get("id", "") model_id = deployment.get("model_info", {}).get("id", "")
## RPM CHECK ##
### get local router cache ### ### get local router cache ###
current_request_cache_local = ( current_request_cache_local = (
self.cache.get_cache(key=model_id, local_only=True) or 0 self.cache.get_cache(key=model_id, local_only=True) or 0
@ -2842,6 +2934,28 @@ class Router:
_rate_limit_error = True _rate_limit_error = True
continue continue
## REGION CHECK ##
if allowed_model_region is not None:
if _litellm_params.get("region_name") is not None and isinstance(
_litellm_params["region_name"], str
):
# check if in allowed_model_region
if (
_is_region_eu(model_region=_litellm_params["region_name"])
== False
):
invalid_model_indices.append(idx)
continue
else:
verbose_router_logger.debug(
"Filtering out model - {}, as model_region=None, and allowed_model_region={}".format(
model_id, allowed_model_region
)
)
# filter out since region unknown, and user wants to filter for specific region
invalid_model_indices.append(idx)
continue
if len(invalid_model_indices) == len(_returned_deployments): if len(invalid_model_indices) == len(_returned_deployments):
""" """
- no healthy deployments available b/c context window checks or rate limit error - no healthy deployments available b/c context window checks or rate limit error
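
The new region check above filters deployments by their `region_name` when the caller's `allowed_model_region` (currently only `"eu"`) is set, and also drops deployments whose region is unknown. A sketch of a deployment advertising its region; the values are placeholders, and `allowed_model_region` itself normally arrives via the proxy's end-user settings rather than being set by hand:

```python
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-35-turbo",
            "litellm_params": {
                "model": "azure/gpt-35-turbo",
                "api_key": "os.environ/AZURE_EU_API_KEY",
                "api_base": "https://my-eu-endpoint.openai.azure.com",
                "region_name": "eu",  # read by the region check above
            },
        },
    ],
    enable_pre_call_checks=True,  # pre-call checks must be enabled for the filter to run
)
```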
@ -2943,6 +3057,7 @@ class Router:
if ( if (
self.routing_strategy != "usage-based-routing-v2" self.routing_strategy != "usage-based-routing-v2"
and self.routing_strategy != "simple-shuffle" and self.routing_strategy != "simple-shuffle"
and self.routing_strategy != "cost-based-routing"
): # prevent regressions for other routing strategies, that don't have async get available deployments implemented. ): # prevent regressions for other routing strategies, that don't have async get available deployments implemented.
return self.get_available_deployment( return self.get_available_deployment(
model=model, model=model,
@ -2980,8 +3095,29 @@ class Router:
# filter pre-call checks # filter pre-call checks
if self.enable_pre_call_checks and messages is not None: if self.enable_pre_call_checks and messages is not None:
_allowed_model_region = (
request_kwargs.get("allowed_model_region")
if request_kwargs is not None
else None
)
if _allowed_model_region == "eu":
healthy_deployments = self._pre_call_checks( healthy_deployments = self._pre_call_checks(
model=model, healthy_deployments=healthy_deployments, messages=messages model=model,
healthy_deployments=healthy_deployments,
messages=messages,
allowed_model_region=_allowed_model_region,
)
else:
verbose_router_logger.debug(
"Ignoring given 'allowed_model_region'={}. Only 'eu' is allowed".format(
_allowed_model_region
)
)
healthy_deployments = self._pre_call_checks(
model=model,
healthy_deployments=healthy_deployments,
messages=messages,
) )
if len(healthy_deployments) == 0: if len(healthy_deployments) == 0:
@ -2999,6 +3135,16 @@ class Router:
messages=messages, messages=messages,
input=input, input=input,
) )
if (
self.routing_strategy == "cost-based-routing"
and self.lowestcost_logger is not None
):
deployment = await self.lowestcost_logger.async_get_available_deployments(
model_group=model,
healthy_deployments=healthy_deployments,
messages=messages,
input=input,
)
elif self.routing_strategy == "simple-shuffle": elif self.routing_strategy == "simple-shuffle":
# if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm # if users pass rpm or tpm, we do a random weighted pick - based on rpm/tpm
############## Check if we can do a RPM/TPM based weighted pick ################# ############## Check if we can do a RPM/TPM based weighted pick #################
@ -3266,6 +3412,8 @@ class Router:
if retry_policy is None: if retry_policy is None:
return None return None
if isinstance(retry_policy, dict):
retry_policy = RetryPolicy(**retry_policy)
if ( if (
isinstance(exception, litellm.BadRequestError) isinstance(exception, litellm.BadRequestError)
and retry_policy.BadRequestErrorRetries is not None and retry_policy.BadRequestErrorRetries is not None
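
The `isinstance(retry_policy, dict)` coercion above lets per-model-group retry policies arrive as plain dicts (for example from a YAML proxy config) instead of `RetryPolicy` objects. A small sketch using the `model_group_retry_policy` attribute referenced earlier in this diff; model names and keys are placeholders:

```python
from litellm import Router
from litellm.types.router import RetryPolicy

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "os.environ/OPENAI_API_KEY"},
        }
    ],
    model_group_retry_policy={
        # a plain dict like {"BadRequestErrorRetries": 0} is now coerced to RetryPolicy
        "gpt-3.5-turbo": RetryPolicy(
            BadRequestErrorRetries=0,
            ContentPolicyViolationErrorRetries=3,
        )
    },
)
```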
@ -3292,6 +3440,56 @@ class Router:
): ):
return retry_policy.ContentPolicyViolationErrorRetries return retry_policy.ContentPolicyViolationErrorRetries
def _initialize_alerting(self):
from litellm.integrations.slack_alerting import SlackAlerting
router_alerting_config: AlertingConfig = self.alerting_config
_slack_alerting_logger = SlackAlerting(
alerting_threshold=router_alerting_config.alerting_threshold,
alerting=["slack"],
default_webhook_url=router_alerting_config.webhook_url,
)
litellm.callbacks.append(_slack_alerting_logger)
litellm.success_callback.append(
_slack_alerting_logger.response_taking_too_long_callback
)
print("\033[94m\nInitialized Alerting for litellm.Router\033[0m\n") # noqa
def send_deployment_cooldown_alert(
self, deployment_id: str, exception_status: Union[str, int]
):
try:
from litellm.proxy.proxy_server import proxy_logging_obj
# trigger slack alert saying deployment is in cooldown
if (
proxy_logging_obj is not None
and proxy_logging_obj.alerting is not None
and "slack" in proxy_logging_obj.alerting
):
_deployment = self.get_deployment(model_id=deployment_id)
if _deployment is None:
return
_litellm_params = _deployment["litellm_params"]
temp_litellm_params = copy.deepcopy(_litellm_params)
temp_litellm_params = dict(temp_litellm_params)
_model_name = _deployment.get("model_name", None)
_api_base = litellm.get_api_base(
model=_model_name, optional_params=temp_litellm_params
)
asyncio.create_task(
proxy_logging_obj.slack_alerting_instance.send_alert(
message=f"Router: Cooling down deployment: {_api_base}, for {self.cooldown_time} seconds. Got exception: {str(exception_status)}",
alert_type="cooldown_deployment",
level="Low",
)
)
except Exception as e:
pass
def flush_cache(self): def flush_cache(self):
litellm.cache = None litellm.cache = None
self.cache.flush_cache() self.cache.flush_cache()
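
`_initialize_alerting()` above wires a router-level `SlackAlerting` logger whenever an `AlertingConfig` is passed, and `send_deployment_cooldown_alert()` posts a notice when a deployment is put in cooldown. A minimal sketch of enabling it (the webhook URL and model entry are placeholders):

```python
from litellm import Router
from litellm.types.router import AlertingConfig

router = Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo",
            "litellm_params": {"model": "gpt-3.5-turbo", "api_key": "os.environ/OPENAI_API_KEY"},
        }
    ],
    alerting_config=AlertingConfig(
        alerting_threshold=300,  # seconds before a hanging request alerts
        webhook_url="https://hooks.slack.com/services/...",
    ),
)
```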


@ -6,7 +6,7 @@
# - use litellm.success + failure callbacks to log when a request completed # - use litellm.success + failure callbacks to log when a request completed
# - in get_available_deployment, for a given model group name -> pick based on traffic # - in get_available_deployment, for a given model group name -> pick based on traffic
import dotenv, os, requests, random import dotenv, os, requests, random # type: ignore
from typing import Optional from typing import Optional
dotenv.load_dotenv() # Loading env variables using dotenv dotenv.load_dotenv() # Loading env variables using dotenv


@ -0,0 +1,350 @@
#### What this does ####
# picks based on response time (for streaming, this is time to first token)
from pydantic import BaseModel, Extra, Field, root_validator
import dotenv, os, requests, random # type: ignore
from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta
import random
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
from litellm.caching import DualCache
from litellm.integrations.custom_logger import CustomLogger
from litellm._logging import verbose_router_logger
from litellm import ModelResponse
from litellm import token_counter
import litellm
class LiteLLMBase(BaseModel):
"""
Implements default functions, all pydantic objects should have.
"""
def json(self, **kwargs):
try:
return self.model_dump() # noqa
except:
# if using pydantic v1
return self.dict()
class LowestCostLoggingHandler(CustomLogger):
test_flag: bool = False
logged_success: int = 0
logged_failure: int = 0
def __init__(
self, router_cache: DualCache, model_list: list, routing_args: dict = {}
):
self.router_cache = router_cache
self.model_list = model_list
async def log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
"""
Update usage on success
"""
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
# ------------
# Setup values
# ------------
"""
{
{model_group}_map: {
id: {
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
}
}
}
"""
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
cost_key = f"{model_group}_map"
response_ms: timedelta = end_time - start_time
final_value = response_ms
total_tokens = 0
if isinstance(response_obj, ModelResponse):
completion_tokens = response_obj.usage.completion_tokens
total_tokens = response_obj.usage.total_tokens
final_value = float(response_ms.total_seconds() / completion_tokens)
# ------------
# Update usage
# ------------
request_count_dict = (
await self.router_cache.async_get_cache(key=cost_key) or {}
)
# check local result first
if id not in request_count_dict:
request_count_dict[id] = {}
if precise_minute not in request_count_dict[id]:
request_count_dict[id][precise_minute] = {}
## TPM
request_count_dict[id][precise_minute]["tpm"] = (
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
)
## RPM
request_count_dict[id][precise_minute]["rpm"] = (
request_count_dict[id][precise_minute].get("rpm", 0) + 1
)
await self.router_cache.async_set_cache(
key=cost_key, value=request_count_dict
)
### TESTING ###
if self.test_flag:
self.logged_success += 1
except Exception as e:
traceback.print_exc()
pass
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
try:
"""
Update cost usage on success
"""
if kwargs["litellm_params"].get("metadata") is None:
pass
else:
model_group = kwargs["litellm_params"]["metadata"].get(
"model_group", None
)
id = kwargs["litellm_params"].get("model_info", {}).get("id", None)
if model_group is None or id is None:
return
elif isinstance(id, int):
id = str(id)
# ------------
# Setup values
# ------------
"""
{
{model_group}_map: {
id: {
"cost": [..]
f"{date:hour:minute}" : {"tpm": 34, "rpm": 3}
}
}
}
"""
cost_key = f"{model_group}_map"
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
response_ms: timedelta = end_time - start_time
final_value = response_ms
total_tokens = 0
if isinstance(response_obj, ModelResponse):
completion_tokens = response_obj.usage.completion_tokens
total_tokens = response_obj.usage.total_tokens
final_value = float(response_ms.total_seconds() / completion_tokens)
# ------------
# Update usage
# ------------
request_count_dict = (
await self.router_cache.async_get_cache(key=cost_key) or {}
)
if id not in request_count_dict:
request_count_dict[id] = {}
if precise_minute not in request_count_dict[id]:
request_count_dict[id][precise_minute] = {}
## TPM
request_count_dict[id][precise_minute]["tpm"] = (
request_count_dict[id][precise_minute].get("tpm", 0) + total_tokens
)
## RPM
request_count_dict[id][precise_minute]["rpm"] = (
request_count_dict[id][precise_minute].get("rpm", 0) + 1
)
await self.router_cache.async_set_cache(
key=cost_key, value=request_count_dict
) # reset map within window
### TESTING ###
if self.test_flag:
self.logged_success += 1
except Exception as e:
traceback.print_exc()
pass
async def async_get_available_deployments(
self,
model_group: str,
healthy_deployments: list,
messages: Optional[List[Dict[str, str]]] = None,
input: Optional[Union[str, List]] = None,
request_kwargs: Optional[Dict] = None,
):
"""
Returns a deployment with the lowest cost
"""
cost_key = f"{model_group}_map"
request_count_dict = await self.router_cache.async_get_cache(key=cost_key) or {}
# -----------------------
# Find lowest used model
# ----------------------
lowest_cost = float("inf")
current_date = datetime.now().strftime("%Y-%m-%d")
current_hour = datetime.now().strftime("%H")
current_minute = datetime.now().strftime("%M")
precise_minute = f"{current_date}-{current_hour}-{current_minute}"
deployment = None
if request_count_dict is None: # base case
return
all_deployments = request_count_dict
for d in healthy_deployments:
## if healthy deployment not yet used
if d["model_info"]["id"] not in all_deployments:
all_deployments[d["model_info"]["id"]] = {
precise_minute: {"tpm": 0, "rpm": 0},
}
try:
input_tokens = token_counter(messages=messages, text=input)
except:
input_tokens = 0
# randomly sample from all_deployments, in case all deployments have latency=0.0
_items = all_deployments.items()
### GET AVAILABLE DEPLOYMENTS ### filter out any deployments > tpm/rpm limits
potential_deployments = []
_cost_per_deployment = {}
for item, item_map in all_deployments.items():
## get the item from model list
_deployment = None
for m in healthy_deployments:
if item == m["model_info"]["id"]:
_deployment = m
if _deployment is None:
continue # skip to next one
_deployment_tpm = (
_deployment.get("tpm", None)
or _deployment.get("litellm_params", {}).get("tpm", None)
or _deployment.get("model_info", {}).get("tpm", None)
or float("inf")
)
_deployment_rpm = (
_deployment.get("rpm", None)
or _deployment.get("litellm_params", {}).get("rpm", None)
or _deployment.get("model_info", {}).get("rpm", None)
or float("inf")
)
item_litellm_model_name = _deployment.get("litellm_params", {}).get("model")
item_litellm_model_cost_map = litellm.model_cost.get(
item_litellm_model_name, {}
)
# check if user provided input_cost_per_token and output_cost_per_token in litellm_params
item_input_cost = None
item_output_cost = None
if _deployment.get("litellm_params", {}).get("input_cost_per_token", None):
item_input_cost = _deployment.get("litellm_params", {}).get(
"input_cost_per_token"
)
if _deployment.get("litellm_params", {}).get("output_cost_per_token", None):
item_output_cost = _deployment.get("litellm_params", {}).get(
"output_cost_per_token"
)
if item_input_cost is None:
item_input_cost = item_litellm_model_cost_map.get(
"input_cost_per_token", 5.0
)
if item_output_cost is None:
item_output_cost = item_litellm_model_cost_map.get(
"output_cost_per_token", 5.0
)
# if litellm["model"] is not in model_cost map -> use item_cost = $10
item_cost = item_input_cost + item_output_cost
item_rpm = item_map.get(precise_minute, {}).get("rpm", 0)
item_tpm = item_map.get(precise_minute, {}).get("tpm", 0)
verbose_router_logger.debug(
f"item_cost: {item_cost}, item_tpm: {item_tpm}, item_rpm: {item_rpm}, model_id: {_deployment.get('model_info', {}).get('id')}"
)
# -------------- #
# Debugging Logic
# -------------- #
# We use _cost_per_deployment to log to langfuse, slack - this is not used to make a decision on routing
# this helps a user to debug why the router picked a specific deployment #
_deployment_api_base = _deployment.get("litellm_params", {}).get(
"api_base", ""
)
if _deployment_api_base is not None:
_cost_per_deployment[_deployment_api_base] = item_cost
# -------------- #
# End of Debugging Logic
# -------------- #
if (
item_tpm + input_tokens > _deployment_tpm
or item_rpm + 1 > _deployment_rpm
): # if user passed in tpm / rpm in the model_list
continue
else:
potential_deployments.append((_deployment, item_cost))
if len(potential_deployments) == 0:
return None
potential_deployments = sorted(potential_deployments, key=lambda x: x[1])
selected_deployment = potential_deployments[0][0]
return selected_deployment
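Editor's note: to make the intent of this new LowestCostLoggingHandler concrete, here is a minimal, hypothetical usage sketch. The per-deployment input_cost_per_token / output_cost_per_token keys match what the handler reads above; the routing_strategy name "cost-based-routing" and the assumption that Router forwards mock_response are not confirmed by this diff.

```python
# Illustrative sketch only, not part of this commit.
# Two deployments in one model group; the handler should prefer the cheaper one
# while it stays under its tpm/rpm limits.
import asyncio

from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-group",
            "litellm_params": {
                "model": "gpt-3.5-turbo",
                "input_cost_per_token": 0.0000005,   # overrides litellm.model_cost
                "output_cost_per_token": 0.0000015,
            },
        },
        {
            "model_name": "gpt-group",
            "litellm_params": {
                "model": "gpt-4",
                "input_cost_per_token": 0.00003,
                "output_cost_per_token": 0.00006,
            },
        },
    ],
    routing_strategy="cost-based-routing",  # assumed name for this handler
)

async def main():
    resp = await router.acompletion(
        model="gpt-group",
        messages=[{"role": "user", "content": "hi"}],
        mock_response="ok",  # keeps the sketch offline, assuming kwargs are forwarded
    )
    print(resp)

asyncio.run(main())
```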
View file
@@ -1,7 +1,7 @@
#### What this does ####
# picks based on response time (for streaming, this is time to first token)
-from pydantic import BaseModel, Extra, Field, root_validator
-import dotenv, os, requests, random
+from pydantic import BaseModel, Extra, Field, root_validator  # type: ignore
+import dotenv, os, requests, random  # type: ignore
from typing import Optional, Union, List, Dict
from datetime import datetime, timedelta
import random
File diff suppressed because one or more lines are too long

View file
@@ -1,5 +1,6 @@
import pytest
from litellm import acompletion
from litellm import completion
def test_acompletion_params():
@@ -7,17 +8,29 @@ def test_acompletion_params():
from litellm.types.completion import CompletionRequest
acompletion_params_odict = inspect.signature(acompletion).parameters
-acompletion_params = {name: param.annotation for name, param in acompletion_params_odict.items()}
-completion_params = {field_name: field_type for field_name, field_type in CompletionRequest.__annotations__.items()}
-# remove kwargs
-acompletion_params.pop("kwargs", None)
+completion_params_dict = inspect.signature(completion).parameters
+acompletion_params = {
+name: param.annotation for name, param in acompletion_params_odict.items()
+}
+completion_params = {
+name: param.annotation for name, param in completion_params_dict.items()
+}
keys_acompletion = set(acompletion_params.keys())
keys_completion = set(completion_params.keys())
print(keys_acompletion)
print("\n\n\n")
print(keys_completion)
print("diff=", keys_completion - keys_acompletion)
# Assert that the parameters are the same
if keys_acompletion != keys_completion:
-pytest.fail("The parameters of the acompletion function and the CompletionRequest class are not the same.")
+pytest.fail(
+"The parameters of the litellm.acompletion function and litellm.completion are not the same."
+)
# test_acompletion_params()
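Editor's note: the pattern this test uses, comparing two call signatures with inspect.signature so a sync and an async wrapper cannot drift apart, generalizes beyond litellm. A minimal, self-contained sketch with hypothetical functions:

```python
# Illustrative sketch: assert two wrappers expose the same parameter names.
import inspect

def fetch(url: str, timeout: float = 10.0): ...
async def afetch(url: str, timeout: float = 10.0): ...

fetch_params = {n: p.annotation for n, p in inspect.signature(fetch).parameters.items()}
afetch_params = {n: p.annotation for n, p in inspect.signature(afetch).parameters.items()}

# The symmetric difference in the assertion message shows exactly which names drifted.
assert set(fetch_params) == set(afetch_params), set(fetch_params) ^ set(afetch_params)
```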
View file
@@ -1,9 +1,11 @@
import copy
import json
import sys
import os
-import io, asyncio
+import asyncio
import logging
from unittest.mock import MagicMock, patch
logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
@@ -18,6 +20,21 @@ import time
import pytest
@pytest.fixture
def langfuse_client():
import langfuse
langfuse_client = langfuse.Langfuse(
public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
secret_key=os.environ["LANGFUSE_SECRET_KEY"],
)
with patch(
"langfuse.Langfuse", MagicMock(return_value=langfuse_client)
) as mock_langfuse_client:
yield mock_langfuse_client()
def search_logs(log_file_path, num_good_logs=1):
"""
Searches the given log file for logs containing the "/api/public" string.
@@ -129,21 +146,10 @@ def test_langfuse_logging_async():
pytest.fail(f"An exception occurred - {e}")
-async def make_async_calls():
+async def make_async_calls(metadata=None, **completion_kwargs):
tasks = []
for _ in range(5):
-task = asyncio.create_task(
-litellm.acompletion(
-model="azure/chatgpt-v-2",
-messages=[{"role": "user", "content": "This is a test"}],
-max_tokens=5,
-temperature=0.7,
-timeout=5,
-user="langfuse_latency_test_user",
-mock_response="It's simple to use and easy to get started",
-)
-)
-tasks.append(task)
+tasks.append(create_async_task())
# Measure the start time before running the tasks
start_time = asyncio.get_event_loop().time()
@@ -161,9 +167,30 @@ async def make_async_calls():
return total_time
def create_async_task(**completion_kwargs):
"""
Creates an async task for the litellm.acompletion function.
This is just the task, but it is not run here.
To run the task it must be awaited or used in other asyncio coroutine execution functions like asyncio.gather.
Any kwargs passed to this function will be passed to the litellm.acompletion function.
By default a standard set of arguments are used for the litellm.acompletion function.
"""
completion_args = {
"model": "azure/chatgpt-v-2",
"messages": [{"role": "user", "content": "This is a test"}],
"max_tokens": 5,
"temperature": 0.7,
"timeout": 5,
"user": "langfuse_latency_test_user",
"mock_response": "It's simple to use and easy to get started",
}
completion_args.update(completion_kwargs)
return asyncio.create_task(litellm.acompletion(**completion_args))
@pytest.mark.asyncio
@pytest.mark.parametrize("stream", [False, True])
-async def test_langfuse_logging_without_request_response(stream):
+async def test_langfuse_logging_without_request_response(stream, langfuse_client):
try:
import uuid
@@ -171,12 +198,8 @@ async def test_langfuse_logging_without_request_response(stream):
litellm.set_verbose = True
litellm.turn_off_message_logging = True
litellm.success_callback = ["langfuse"]
-response = await litellm.acompletion(
+response = await create_async_task(
model="gpt-3.5-turbo",
-mock_response="It's simple to use and easy to get started",
-messages=[{"role": "user", "content": "Hi 👋 - i'm claude"}],
-max_tokens=10,
-temperature=0.2,
stream=stream,
metadata={"trace_id": _unique_trace_name},
)
@@ -185,14 +208,8 @@ async def test_langfuse_logging_without_request_response(stream):
async for chunk in response:
print(chunk)
-await asyncio.sleep(3)
-import langfuse
-langfuse_client = langfuse.Langfuse(
-public_key=os.environ["LANGFUSE_PUBLIC_KEY"],
-secret_key=os.environ["LANGFUSE_SECRET_KEY"],
-)
+langfuse_client.flush()
+await asyncio.sleep(2)
# get trace with _unique_trace_name
trace = langfuse_client.get_generations(trace_id=_unique_trace_name)
@@ -211,6 +228,123 @@ async def test_langfuse_logging_without_request_response(stream):
pytest.fail(f"An exception occurred - {e}")
@pytest.mark.asyncio
async def test_langfuse_logging_metadata(langfuse_client):
"""
Test that creates multiple traces, with a varying number of generations and sets various metadata fields
Confirms that no metadata that is standard within Langfuse is duplicated in the respective trace or generation metadata
For trace continuation certain metadata of the trace is overridden with metadata from the last generation based on the update_trace_keys field
Version is set for both the trace and the generation
Release is just set for the trace
Tags is just set for the trace
"""
import uuid
litellm.set_verbose = True
litellm.success_callback = ["langfuse"]
trace_identifiers = {}
expected_filtered_metadata_keys = {
"trace_name",
"trace_id",
"existing_trace_id",
"trace_user_id",
"session_id",
"tags",
"generation_name",
"generation_id",
"prompt",
}
trace_metadata = {
"trace_actual_metadata_key": "trace_actual_metadata_value"
} # Allows for setting the metadata on the trace
run_id = str(uuid.uuid4())
session_id = f"litellm-test-session-{run_id}"
trace_common_metadata = {
"session_id": session_id,
"tags": ["litellm-test-tag1", "litellm-test-tag2"],
"update_trace_keys": [
"output",
"trace_metadata",
], # Overwrite the following fields in the trace with the last generation's output and the trace_user_id
"trace_metadata": trace_metadata,
"gen_metadata_key": "gen_metadata_value", # Metadata key that should not be filtered in the generation
"trace_release": "litellm-test-release",
"version": "litellm-test-version",
}
for trace_num in range(1, 3): # Two traces
metadata = copy.deepcopy(trace_common_metadata)
trace_id = f"litellm-test-trace{trace_num}-{run_id}"
metadata["trace_id"] = trace_id
metadata["trace_name"] = trace_id
trace_identifiers[trace_id] = []
print(f"Trace: {trace_id}")
for generation_num in range(
1, trace_num + 1
): # Each trace has a number of generations equal to its trace number
metadata["trace_user_id"] = f"litellm-test-user{generation_num}-{run_id}"
generation_id = (
f"litellm-test-trace{trace_num}-generation-{generation_num}-{run_id}"
)
metadata["generation_id"] = generation_id
metadata["generation_name"] = generation_id
metadata["trace_metadata"][
"generation_id"
] = generation_id # Update to test if trace_metadata is overwritten by update trace keys
trace_identifiers[trace_id].append(generation_id)
print(f"Generation: {generation_id}")
response = await create_async_task(
model="gpt-3.5-turbo",
mock_response=f"{session_id}:{trace_id}:{generation_id}",
messages=[
{
"role": "user",
"content": f"{session_id}:{trace_id}:{generation_id}",
}
],
max_tokens=100,
temperature=0.2,
metadata=copy.deepcopy(
metadata
), # Every generation needs its own metadata, langfuse is not async/thread safe without it
)
print(response)
metadata["existing_trace_id"] = trace_id
langfuse_client.flush()
await asyncio.sleep(2)
# Tests the metadata filtering and the override of the output to be the last generation
for trace_id, generation_ids in trace_identifiers.items():
trace = langfuse_client.get_trace(id=trace_id)
assert trace.id == trace_id
assert trace.session_id == session_id
assert trace.metadata != trace_metadata
generations = list(
reversed(langfuse_client.get_generations(trace_id=trace_id).data)
)
assert len(generations) == len(generation_ids)
assert (
trace.input == generations[0].input
) # Should be set by the first generation
assert (
trace.output == generations[-1].output
) # Should be overwritten by the last generation according to update_trace_keys
assert (
trace.metadata != generations[-1].metadata
) # Should be overwritten by the last generation according to update_trace_keys
assert trace.metadata["generation_id"] == generations[-1].id
assert set(trace.tags).issuperset(trace_common_metadata["tags"])
print("trace_from_langfuse", trace)
for generation_id, generation in zip(generation_ids, generations):
assert generation.id == generation_id
assert generation.trace_id == trace_id
assert set(generation.metadata.keys()).isdisjoint(
expected_filtered_metadata_keys
)
print("generation_from_langfuse", generation)
@pytest.mark.skip(reason="beta test - checking langfuse output")
def test_langfuse_logging():
try:
@@ -570,6 +704,10 @@ def test_langfuse_existing_trace_id():
assert initial_langfuse_trace_dict == new_langfuse_trace_dict
@pytest.mark.skipif(
condition=not os.environ.get("OPENAI_API_KEY", False),
reason="Authentication missing for openai",
)
def test_langfuse_logging_tool_calling():
litellm.set_verbose = True
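Editor's note: the metadata keys exercised by test_langfuse_logging_metadata above are the ones a caller passes straight through litellm.acompletion. A minimal sketch using only keys that appear in the test; the model, ids, and mock response are placeholders, and Langfuse credentials are assumed to be in the environment:

```python
# Illustrative sketch only, not part of this commit.
# Groups a generation under a named Langfuse trace via the metadata dict.
import asyncio

import litellm

litellm.success_callback = ["langfuse"]  # needs LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY

async def main():
    await litellm.acompletion(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": "hello"}],
        mock_response="hi there",
        metadata={
            "trace_id": "my-trace-1",
            "trace_name": "my-trace-1",
            "session_id": "my-session",
            "tags": ["example"],
            "generation_name": "my-trace-1-generation-1",
            "update_trace_keys": ["output"],  # last generation overwrites the trace output
        },
    )

asyncio.run(main())
```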
View file
@@ -1,7 +1,7 @@
# What is this?
## Tests slack alerting on proxy logging object
-import sys
+import sys, json
import os
import io, asyncio
from datetime import datetime, timedelta
@@ -10,14 +10,18 @@ from datetime import datetime, timedelta
# logging.basicConfig(level=logging.DEBUG)
sys.path.insert(0, os.path.abspath("../.."))
from litellm.proxy.utils import ProxyLogging
-from litellm.caching import DualCache
+from litellm.caching import DualCache, RedisCache
import litellm
import pytest
import asyncio
from unittest.mock import patch, MagicMock
from litellm.utils import get_api_base
from litellm.caching import DualCache
-from litellm.integrations.slack_alerting import SlackAlerting
+from litellm.integrations.slack_alerting import SlackAlerting, DeploymentMetrics
import unittest.mock
from unittest.mock import AsyncMock
import pytest
from litellm.router import AlertingConfig, Router
@pytest.mark.parametrize(
@@ -61,7 +65,7 @@ async def test_get_api_base():
end_time = datetime.now()
time_difference_float, model, api_base, messages = (
-_pl.slack_alerting_instance._response_taking_too_long_callback(
+_pl.slack_alerting_instance._response_taking_too_long_callback_helper(
kwargs={
"model": model,
"messages": messages,
@@ -98,7 +102,10 @@ def mock_env(monkeypatch):
# Test the __init__ method
def test_init():
slack_alerting = SlackAlerting(
-alerting_threshold=32, alerting=["slack"], alert_types=["llm_exceptions"]
+alerting_threshold=32,
+alerting=["slack"],
+alert_types=["llm_exceptions"],
+internal_usage_cache=DualCache(),
)
assert slack_alerting.alerting_threshold == 32
assert slack_alerting.alerting == ["slack"]
@@ -116,7 +123,7 @@ from datetime import datetime, timedelta
@pytest.fixture
def slack_alerting():
-return SlackAlerting(alerting_threshold=1)
+return SlackAlerting(alerting_threshold=1, internal_usage_cache=DualCache())
# Test for hanging LLM responses
@@ -185,3 +192,170 @@ async def test_send_alert(slack_alerting):
mock_post.return_value.status_code = 200
await slack_alerting.send_alert("Test message", "Low", "budget_alerts")
mock_post.assert_awaited_once()
@pytest.mark.asyncio
async def test_daily_reports_unit_test(slack_alerting):
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
router = litellm.Router(
model_list=[
{
"model_name": "test-gpt",
"litellm_params": {"model": "gpt-3.5-turbo"},
"model_info": {"id": "1234"},
}
]
)
deployment_metrics = DeploymentMetrics(
id="1234",
failed_request=False,
latency_per_output_token=20.3,
updated_at=litellm.utils.get_utc_datetime(),
)
updated_val = await slack_alerting.async_update_daily_reports(
deployment_metrics=deployment_metrics
)
assert updated_val == 1
await slack_alerting.send_daily_reports(router=router)
mock_send_alert.assert_awaited_once()
@pytest.mark.asyncio
async def test_daily_reports_completion(slack_alerting):
with patch.object(slack_alerting, "send_alert", new=AsyncMock()) as mock_send_alert:
litellm.callbacks = [slack_alerting]
# on async success
router = litellm.Router(
model_list=[
{
"model_name": "gpt-5",
"litellm_params": {
"model": "gpt-3.5-turbo",
},
}
]
)
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
await asyncio.sleep(3)
response_val = await slack_alerting.send_daily_reports(router=router)
assert response_val == True
mock_send_alert.assert_awaited_once()
# on async failure
router = litellm.Router(
model_list=[
{
"model_name": "gpt-5",
"litellm_params": {"model": "gpt-3.5-turbo", "api_key": "bad_key"},
}
]
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except Exception as e:
pass
await asyncio.sleep(3)
response_val = await slack_alerting.send_daily_reports(router=router)
assert response_val == True
mock_send_alert.assert_awaited()
@pytest.mark.asyncio
async def test_daily_reports_redis_cache_scheduler():
redis_cache = RedisCache()
slack_alerting = SlackAlerting(
internal_usage_cache=DualCache(redis_cache=redis_cache)
)
router = litellm.Router(
model_list=[
{
"model_name": "gpt-5",
"litellm_params": {
"model": "gpt-3.5-turbo",
},
}
]
)
with patch.object(
slack_alerting, "send_alert", new=AsyncMock()
) as mock_send_alert, patch.object(
redis_cache, "async_set_cache", new=AsyncMock()
) as mock_redis_set_cache:
# initial call - expect empty
await slack_alerting._run_scheduler_helper(llm_router=router)
try:
json.dumps(mock_redis_set_cache.call_args[0][1])
except Exception as e:
pytest.fail(
"Cache value can't be json dumped - {}".format(
mock_redis_set_cache.call_args[0][1]
)
)
mock_redis_set_cache.assert_awaited_once()
# second call - expect empty
await slack_alerting._run_scheduler_helper(llm_router=router)
@pytest.mark.asyncio
@pytest.mark.skip(reason="Local test. Test if slack alerts are sent.")
async def test_send_llm_exception_to_slack():
from litellm.router import AlertingConfig
# on async success
router = litellm.Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "gpt-3.5-turbo",
"api_key": "bad_key",
},
},
{
"model_name": "gpt-5-good",
"litellm_params": {
"model": "gpt-3.5-turbo",
},
},
],
alerting_config=AlertingConfig(
alerting_threshold=0.5, webhook_url=os.getenv("SLACK_WEBHOOK_URL")
),
)
try:
await router.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
except:
pass
await router.acompletion(
model="gpt-5-good",
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
await asyncio.sleep(3)
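Editor's note: the recurring change in this test file is that SlackAlerting is now constructed with an internal_usage_cache. A minimal sketch of the updated constructor, mirroring the fixture and test_init above; the threshold value is a placeholder:

```python
# Illustrative sketch only, not part of this commit.
# SlackAlerting now takes an internal_usage_cache (a DualCache), as the updated tests show.
from litellm.caching import DualCache
from litellm.integrations.slack_alerting import SlackAlerting

slack_alerting = SlackAlerting(
    alerting_threshold=300,  # placeholder: seconds before a request counts as hanging
    alerting=["slack"],
    alert_types=["llm_exceptions"],
    internal_usage_cache=DualCache(),
)
```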
View file
@@ -118,6 +118,7 @@ def test_completion_claude():
def test_completion_claude_3_empty_response():
litellm.set_verbose = True
messages = [
{
"role": "system",
@@ -2167,9 +2168,9 @@ def test_completion_replicate_vicuna():
def test_replicate_custom_prompt_dict():
litellm.set_verbose = True
-model_name = "replicate/meta/llama-2-70b-chat"
+model_name = "replicate/meta/llama-2-7b"
litellm.register_prompt_template(
-model="replicate/meta/llama-2-70b-chat",
+model="replicate/meta/llama-2-7b",
initial_prompt_value="You are a good assistant",  # [OPTIONAL]
roles={
"system": {
@@ -2199,6 +2200,7 @@ def test_replicate_custom_prompt_dict():
repetition_penalty=0.1,
num_retries=3,
)
except litellm.APIError as e:
pass
except litellm.APIConnectionError as e:
@@ -3016,6 +3018,21 @@ async def test_acompletion_gemini():
pytest.fail(f"Error occurred: {e}")
# Deepseek tests
def test_completion_deepseek():
litellm.set_verbose = True
model_name = "deepseek/deepseek-chat"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
try:
response = completion(model=model_name, messages=messages)
# Add any assertions here to check the response
print(response)
except litellm.APIError as e:
pass
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# Palm tests
def test_completion_palm():
litellm.set_verbose = True
View file
@@ -231,14 +231,17 @@ def test_cost_bedrock_pricing():
assert cost == predicted_cost
@pytest.mark.skip(reason="AWS disabled our access")
def test_cost_bedrock_pricing_actual_calls():
litellm.set_verbose = True
model = "anthropic.claude-instant-v1"
messages = [{"role": "user", "content": "Hey, how's it going?"}]
-response = litellm.completion(model=model, messages=messages)
-assert response._hidden_params["region_name"] is not None
+response = litellm.completion(
+model=model, messages=messages, mock_response="hello cool one"
+)
+print("response", response)
cost = litellm.completion_cost(
model="bedrock/anthropic.claude-instant-v1",
completion_response=response,
messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
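Editor's note: the new model= argument passed to litellm.completion_cost above pins the pricing entry even when the response itself was mocked. A minimal sketch of the same pattern outside the test; the model and values are placeholders:

```python
# Illustrative sketch only, not part of this commit.
# Compute a cost estimate from a mocked response, pinning the pricing entry via model=.
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
    mock_response="hello cool one",
)
cost = litellm.completion_cost(
    model="gpt-3.5-turbo",
    completion_response=response,
    messages=[{"role": "user", "content": "Hey, how's it going?"}],
)
print(f"estimated cost: ${cost:.8f}")
```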
View file
@@ -140,6 +140,8 @@ async def test_add_existing_deployment():
deployment_2.to_json(exclude_none=True),
]
)
init_len_list = len(llm_router.model_list)
print(f"llm_router: {llm_router}")
master_key = "sk-1234"
setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
@@ -164,7 +166,7 @@ async def test_add_existing_deployment():
db_models = [db_model]
num_added = pc._add_deployment(db_models=db_models)
-assert num_added == 0
+assert init_len_list == len(llm_router.model_list)
litellm_params = LiteLLM_Params(
Some files were not shown because too many files have changed in this diff.