Merge branch 'main' into litellm_parallel_requests

This commit is contained in:
Krish Dholakia 2024-07-24 19:25:56 -07:00 committed by GitHub
commit e6963217ba
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
79 changed files with 3913 additions and 180 deletions

View file

@ -8,7 +8,7 @@
<img src="https://railway.app/button.svg" alt="Deploy on Railway">
</a>
</p>
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, etc.]
<p align="center">Call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
<br>
</p>
<h4 align="center"><a href="https://docs.litellm.ai/docs/simple_proxy" target="_blank">OpenAI Proxy Server</a> | <a href="https://docs.litellm.ai/docs/hosted" target="_blank"> Hosted Proxy (Preview)</a> | <a href="https://docs.litellm.ai/docs/enterprise"target="_blank">Enterprise Tier</a></h4>

View file

@ -9,13 +9,11 @@ services:
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
# volumes:
# - ./proxy_server_config.yaml:/app/config.yaml
# command: [ "--config", "./config.yaml", "--port", "4000"]
###############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://postgres:example@db:5432/postgres"
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
@ -25,11 +23,31 @@ services:
image: postgres
restart: always
environment:
POSTGRES_PASSWORD: example
POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
healthcheck:
test: ["CMD-SHELL", "pg_isready"]
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s
timeout: 5s
retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
# ...rest of your docker-compose config if any
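The compose file above mounts a `./prometheus.yml` that is not shown in this diff. A minimal sketch of what it could contain (the scrape target, service name, and intervals below are assumptions; adjust them for your deployment):
```yaml
# prometheus.yml - minimal sketch; all values below are assumptions
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: "litellm"
    metrics_path: /metrics            # LiteLLM proxy's Prometheus metrics endpoint
    static_configs:
      - targets: ["litellm:4000"]     # assumes the proxy service is reachable as `litellm` on port 4000
```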

View file

@ -0,0 +1,72 @@
import Image from '@theme/IdealImage';
# 🔥 Arize AI - Logging LLM Input/Output
AI Observability and Evaluation Platform
:::tip
This is community maintained. Please make an issue if you run into a bug:
https://github.com/BerriAI/litellm
:::
## Pre-Requisites
Make an account on [Arize AI](https://app.arize.com/auth/login)
## Quick Start
Use just 2 lines of code to instantly log your responses **across all providers** with Arize
```python
litellm.callbacks = ["arize"]
```
```python
import litellm
import os
os.environ["ARIZE_SPACE_KEY"] = ""
os.environ["ARIZE_API_KEY"] = "" # defaults to litellm-completion
# LLM API Keys
os.environ['OPENAI_API_KEY']=""
# set arize as a callback, litellm will send the data to arize
litellm.callbacks = ["arize"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
### Using with LiteLLM Proxy
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
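With this config, requests through the proxy should show up in Arize. A quick test (a sketch that assumes the proxy runs on port 4000 with master key `sk-1234`):
```shell
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "gpt-4",
    "messages": [{"role": "user", "content": "hello from litellm"}]
}'
```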
## Support & Talk to Founders
- [Schedule Demo 👋](https://calendly.com/d/4mp-gd3-k5k/berriai-1-1-onboarding-litellm-hosted-version)
- [Community Discord 💭](https://discord.gg/wuPM9dRgDw)
- Our numbers 📞 +1 (770) 8783-106 / +1 (412) 618-6238
- Our emails ✉️ ishaan@berri.ai / krrish@berri.ai

View file

@ -0,0 +1,147 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# ⚡️ Braintrust - Evals + Logging
[Braintrust](https://www.braintrust.dev/) covers evaluations, logging, a prompt playground, and data management for AI products.
## Quick Start
```python
# pip install litellm
import litellm
import os
# set env
os.environ["BRAINTRUST_API_KEY"] = ""
os.environ['OPENAI_API_KEY']=""
# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
]
)
```
## OpenAI Proxy Usage
1. Add keys to env
```env
BRAINTRUST_API_KEY=""
```
2. Add braintrust to callbacks
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
callbacks: ["braintrust"]
```
3. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
## Advanced - pass Project ID
<Tabs>
<TabItem value="sdk" label="SDK">
```python
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"project_id": "my-special-project"
}
)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
**Curl**
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
],
"metadata": {
"project_id": "my-special-project"
}
}'
```
**OpenAI SDK**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
],
extra_body={ # pass in any provider-specific param, if not supported by openai, https://docs.litellm.ai/docs/completion/input#provider-specific-params
"metadata": { # 👈 use for logging additional params (e.g. to langfuse)
"project_id": "my-special-project"
}
}
)
print(response)
```
For more examples, [**Click Here**](../proxy/user_keys.md#chatcompletions)
</TabItem>
</Tabs>
## Full API Spec
Here's everything you can pass in `metadata` for a Braintrust request (see the sketch below):
- `braintrust_*` - any metadata field starting with `braintrust_` will be passed as metadata to the logging request
- `project_id` - set the project id for a Braintrust call. Default is `litellm`.
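A sketch combining both fields (the `braintrust_environment` key is a made-up example of a `braintrust_*` field):
```python
import litellm

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
    metadata={
        "project_id": "my-special-project",    # log to this Braintrust project instead of the default `litellm`
        "braintrust_environment": "staging",   # hypothetical example of a `braintrust_*` metadata field
    },
)
```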

View file

@ -1,4 +1,4 @@
# 🧠 Helicone - OSS LLM Observability Platform
# 🧊 Helicone - OSS LLM Observability Platform
:::tip

View file

@ -1,6 +1,6 @@
import Image from '@theme/IdealImage';
# Langsmith - Logging LLM Input/Output
# 🦜 Langsmith - Logging LLM Input/Output
:::tip
@ -56,7 +56,7 @@ response = litellm.completion(
```
## Advanced
### Set Custom Project & Run names
### Set Langsmith fields - Custom Project, Run names, tags
```python
import litellm
@ -77,6 +77,7 @@ response = litellm.completion(
metadata={
"run_name": "litellmRUN", # langsmith run name
"project_name": "litellm-completion", # langsmith project name
"tags": ["model1", "prod-2"] # tags to log on langsmith
}
)
print(response)

View file

@ -1,10 +1,16 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Raw Request/Response Logging
## Logging
See the raw request/response sent by LiteLLM in your logging provider (OTEL/Langfuse/etc.).
**on SDK**
<Tabs>
<TabItem value="sdk" label="SDK">
```python
# pip install langfuse
import litellm
@ -34,13 +40,85 @@ response = litellm.completion(
)
```
**on Proxy**
</TabItem>
<TabItem value="proxy" label="PROXY">
```yaml
litellm_settings:
log_raw_request_response: True
```
</TabItem>
</Tabs>
**Expected Log**
<Image img={require('../../img/raw_request_log.png')}/>
## Return Raw Response Headers
Return raw response headers from the LLM provider.
Currently only supported for OpenAI.
<Tabs>
<TabItem value="sdk" label="SDK">
```python
import litellm
import os
litellm.return_response_headers = True
## set ENV variables
os.environ["OPENAI_API_KEY"] = "your-api-key"
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{ "content": "Hello, how are you?","role": "user"}]
)
print(response._hidden_params)
```
</TabItem>
<TabItem value="proxy" label="PROXY">
1. Setup config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
api_key: os.environ/OPENAI_API_KEY
litellm_settings:
return_response_headers: true
```
2. Test it!
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{ "role": "system", "content": "Use your tools smartly"},
{ "role": "user", "content": "What time is it now? Use your tool"}
]
}'
```
</TabItem>
</Tabs>
**Expected Response**
<Image img={require('../../img/raw_response_headers.png')}/>

View file

@ -0,0 +1,223 @@
# OpenID Connect (OIDC)
LiteLLM supports using OpenID Connect (OIDC) for authentication to upstream services. This allows you to avoid storing sensitive credentials in your configuration files.
## OIDC Identity Provider (IdP)
LiteLLM supports the following OIDC identity providers:
| Provider | Config Name | Custom Audiences |
| -------------------------| ------------ | ---------------- |
| Google Cloud Run | `google` | Yes |
| CircleCI v1 | `circleci` | No |
| CircleCI v2 | `circleci_v2`| No |
| GitHub Actions | `github` | Yes |
| Azure Kubernetes Service | `azure` | No |
If you would like to use a different OIDC provider, please open an issue on GitHub.
## OIDC Relying Party (RP)
LiteLLM supports the following OIDC relying parties / clients:
- Amazon Bedrock
- Azure OpenAI
- _(Coming soon) Google Cloud Vertex AI_
### Configuring OIDC
Wherever a secret key can be used, OIDC can be used in-place. The general format is:
```
oidc/config_name_here/audience_here
```
For providers that do not use the `audience` parameter, you can (and should) omit it:
```
oidc/config_name_here/
```
## Examples
### Google Cloud Run -> Amazon Bedrock
```yaml
model_list:
- model_name: claude-3-haiku-20240307
litellm_params:
model: bedrock/anthropic.claude-3-haiku-20240307-v1:0
aws_region_name: us-west-2
aws_session_name: "litellm"
aws_role_name: "arn:aws:iam::YOUR_THING_HERE:role/litellm-google-demo"
aws_web_identity_token: "oidc/google/https://example.com"
```
### CircleCI v2 -> Amazon Bedrock
```yaml
model_list:
- model_name: command-r
litellm_params:
model: bedrock/cohere.command-r-v1:0
aws_region_name: us-west-2
aws_session_name: "my-test-session"
aws_role_name: "arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci"
aws_web_identity_token: "oidc/circleci_v2/"
```
#### Amazon IAM Role Configuration for CircleCI v2 -> Bedrock
The configuration below is only an example. You should adjust the permissions and trust relationship to match your specific use case.
Permissions:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "VisualEditor0",
"Effect": "Allow",
"Action": [
"bedrock:InvokeModel",
"bedrock:InvokeModelWithResponseStream"
],
"Resource": [
"arn:aws:bedrock:*::foundation-model/anthropic.claude-3-haiku-20240307-v1:0",
"arn:aws:bedrock:*::foundation-model/cohere.command-r-v1:0"
]
}
]
}
```
See https://docs.aws.amazon.com/bedrock/latest/userguide/security_iam_id-based-policy-examples.html for more examples.
Trust Relationship:
```json
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Federated": "arn:aws:iam::335785316107:oidc-provider/oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"Action": "sts:AssumeRoleWithWebIdentity",
"Condition": {
"StringEquals": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:aud": "c5a99188-154f-4f69-8da2-b442b1bf78dd"
},
"ForAnyValue:StringLike": {
"oidc.circleci.com/org/c5a99188-154f-4f69-8da2-b442b1bf78dd:sub": [
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/main",
"org/c5a99188-154f-4f69-8da2-b442b1bf78dd/project/*/user/*/vcs-origin/github.com/BerriAI/litellm/vcs-ref/refs/heads/litellm_*"
]
}
}
}
]
}
```
This trust relationship restricts CircleCI to only assume the role on the main branch and branches that start with `litellm_`.
For CircleCI (v1 and v2), you also need to add your organization's OIDC provider in your AWS IAM settings. See https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-idp_oidc.html for more information.
:::tip
You should _never_ need to create an IAM user. If you did, you're not using OIDC correctly. You should only be creating a role with permissions and a trust relationship to your OIDC provider.
:::
### Google Cloud Run -> Azure OpenAI
```yaml
model_list:
- model_name: gpt-4o-2024-05-13
litellm_params:
model: azure/gpt-4o-2024-05-13
azure_ad_token: "oidc/google/https://example.com"
api_version: "2024-06-01"
api_base: "https://demo-here.openai.azure.com"
model_info:
base_model: azure/gpt-4o-2024-05-13
```
For Azure OpenAI, you need to define `AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, and optionally `AZURE_AUTHORITY_HOST` in your environment.
```bash
export AZURE_CLIENT_ID="91a43c21-cf21-4f34-9085-331015ea4f91" # Azure AD Application (Client) ID
export AZURE_TENANT_ID="f3b1cf79-eba8-40c3-8120-cb26aca169c2" # Will be the same across all of your Azure AD applications
export AZURE_AUTHORITY_HOST="https://login.microsoftonline.com" # 👈 Optional, defaults to "https://login.microsoftonline.com"
```
:::tip
You can find `AZURE_TENANT_ID` by visiting `https://login.microsoftonline.com/YOUR_DOMAIN_HERE/v2.0/.well-known/openid-configuration` and looking for the UUID in the `issuer` field.
:::
:::tip
Don't set `AZURE_AUTHORITY_HOST` in your environment unless you need to override the default value. This way, if the default value changes in the future, you won't need to update your environment.
:::
:::tip
By default, Azure AD applications use the audience `api://AzureADTokenExchange`. We recommend setting the audience to something more specific to your application.
:::
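For example, a hypothetical custom audience could then be referenced in your model config like this (the audience value is a placeholder):
```yaml
azure_ad_token: "oidc/google/api://litellm-proxy-example" # placeholder audience
```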
#### Azure AD Application Configuration
Unfortunately, Azure is a bit more complicated to set up than other OIDC relying parties like AWS. Basically, you have to:
1. Create an Azure application.
2. Add a federated credential for the OIDC IdP you're using (e.g. Google Cloud Run).
3. Add the Azure application to the resource group that contains the Azure OpenAI resource(s).
4. Give the Azure application the necessary role to access the Azure OpenAI resource(s).
The custom role below grants the recommended minimum permissions for the Azure application to access Azure OpenAI resources. You should adjust the permissions to match your specific use case.
```json
{
"id": "/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/providers/Microsoft.Authorization/roleDefinitions/baf42808-99ff-466d-b9da-f95bb0422c5f",
"properties": {
"roleName": "invoke-only",
"description": "",
"assignableScopes": [
"/subscriptions/24ebb700-ec2f-417f-afad-78fe15dcc91f/resourceGroups/your-openai-group-name"
],
"permissions": [
{
"actions": [],
"notActions": [],
"dataActions": [
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/audio/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/search/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/extensions/chat/completions/action",
"Microsoft.CognitiveServices/accounts/OpenAI/deployments/embeddings/action",
"Microsoft.CognitiveServices/accounts/OpenAI/images/generations/action"
],
"notDataActions": []
}
]
}
}
```
_Note: Your UUIDs will be different._
Please contact us for paid enterprise support if you need help setting up Azure AD applications.

View file

@ -56,7 +56,7 @@ for chunk in response:
print(chunk["choices"][0]["delta"]["content"]) # same as openai format
```
## OpenAI Proxy Usage
## Usage with LiteLLM Proxy
Here's how to call Anthropic with the LiteLLM Proxy Server
@ -69,14 +69,6 @@ export ANTHROPIC_API_KEY="your-api-key"
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
@ -91,6 +83,14 @@ model_list:
litellm --config /path/to/config.yaml
```
</TabItem>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
</Tabs>
### 3. Test it

View file

@ -0,0 +1,60 @@
# FriendliAI
https://suite.friendli.ai/
**We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests**
## API Key
```python
# env variable
os.environ['FRIENDLI_TOKEN']
os.environ['FRIENDLI_API_BASE'] # Optional. Set this when using dedicated endpoint.
```
## Sample Usage
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
)
print(response)
```
## Sample Usage - Streaming
```python
from litellm import completion
import os
os.environ['FRIENDLI_TOKEN'] = ""
response = completion(
model="friendliai/mixtral-8x7b-instruct-v0-1",
messages=[
{"role": "user", "content": "hello from litellm"}
],
stream=True
)
for chunk in response:
print(chunk)
```
## Supported Models
### Serverless Endpoints
We support ALL FriendliAI models, just set `friendliai/` as a prefix when sending completion requests
| Model Name | Function Call |
|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| mixtral-8x7b-instruct | `completion(model="friendliai/mixtral-8x7b-instruct-v0-1", messages)` |
| meta-llama-3-8b-instruct | `completion(model="friendliai/meta-llama-3-8b-instruct", messages)` |
| meta-llama-3-70b-instruct | `completion(model="friendliai/meta-llama-3-70b-instruct", messages)` |
### Dedicated Endpoints
```
model="friendliai/$ENDPOINT_ID:$ADAPTER_ROUTE"
```
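For example (the endpoint ID and adapter route below are placeholders for your own dedicated deployment):
```python
from litellm import completion
import os

os.environ["FRIENDLI_TOKEN"] = ""
os.environ["FRIENDLI_API_BASE"] = ""  # base URL of your dedicated endpoint

response = completion(
    model="friendliai/YOUR_ENDPOINT_ID:YOUR_ADAPTER_ROUTE",  # placeholders
    messages=[{"role": "user", "content": "hello from litellm"}],
)
print(response)
```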

View file

@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# Groq
https://groq.com/
@ -20,7 +23,7 @@ import os
os.environ['GROQ_API_KEY'] = ""
response = completion(
model="groq/llama2-70b-4096",
model="groq/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
@ -35,7 +38,7 @@ import os
os.environ['GROQ_API_KEY'] = ""
response = completion(
model="groq/llama2-70b-4096",
model="groq/llama3-8b-8192",
messages=[
{"role": "user", "content": "hello from litellm"}
],
@ -47,6 +50,101 @@ for chunk in response:
```
## Usage with LiteLLM Proxy
### 1. Set Groq Models on config.yaml
```yaml
model_list:
- model_name: groq-llama3-8b-8192 # Model Alias to use for requests
litellm_params:
model: groq/llama3-8b-8192
api_key: "os.environ/GROQ_API_KEY" # ensure you have `GROQ_API_KEY` in your .env
```
### 2. Start Proxy
```
litellm --config config.yaml
```
### 3. Test it
Make a request to the LiteLLM proxy
<Tabs>
<TabItem value="Curl" label="Curl Request">
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "groq-llama3-8b-8192",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}
'
```
</TabItem>
<TabItem value="openai" label="OpenAI v1.0.0+">
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
response = client.chat.completions.create(model="groq-llama3-8b-8192", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
}
])
print(response)
```
</TabItem>
<TabItem value="langchain" label="Langchain">
```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "groq-llama3-8b-8192",
temperature=0.1
)
messages = [
SystemMessage(
content="You are a helpful assistant that im using to make a test request to."
),
HumanMessage(
content="test from litellm. tell me why it's amazing in 1 sentence"
),
]
response = chat(messages)
print(response)
```
</TabItem>
</Tabs>
## Supported Models - ALL Groq Models Supported!
We support ALL Groq models, just set `groq/` as a prefix when sending completion requests
@ -114,7 +212,7 @@ tools = [
}
]
response = litellm.completion(
model="groq/llama2-70b-4096",
model="groq/llama3-8b-8192",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
@ -154,7 +252,7 @@ if tool_calls:
) # extend conversation with function response
print(f"messages: {messages}")
second_response = litellm.completion(
model="groq/llama2-70b-4096", messages=messages
model="groq/llama3-8b-8192", messages=messages
) # get a new response from the model where it can see the function response
print("second response\n", second_response)
```

View file

@ -749,6 +749,85 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
</TabItem>
</Tabs>
## Llama 3 API
| Model Name | Function Call |
|------------------|--------------------------------------|
| meta/llama3-405b-instruct-maas | `completion('vertex_ai/meta/llama3-405b-instruct-maas', messages)` |
### Usage
<Tabs>
<TabItem value="sdk" label="SDK">
```python
from litellm import completion
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ""
model = "meta/llama3-405b-instruct-maas"
vertex_ai_project = "your-vertex-project" # can also set this as os.environ["VERTEXAI_PROJECT"]
vertex_ai_location = "your-vertex-location" # can also set this as os.environ["VERTEXAI_LOCATION"]
response = completion(
model="vertex_ai/" + model,
messages=[{"role": "user", "content": "hi"}],
temperature=0.7,
vertex_ai_project=vertex_ai_project,
vertex_ai_location=vertex_ai_location,
)
print("\nModel Response", response)
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Add to config**
```yaml
model_list:
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-east-1"
- model_name: anthropic-llama
litellm_params:
model: vertex_ai/meta/llama3-405b-instruct-maas
vertex_ai_project: "my-test-project"
vertex_ai_location: "us-west-1"
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING at http://0.0.0.0:4000
```
**3. Test it!** Send a request using the `model_name` from your config (here, `anthropic-llama`):
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"model": "anthropic-llama", # 👈 the 'model_name' in config
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
</TabItem>
</Tabs>
## Model Garden
| Model Name | Function Call |
|------------------|--------------------------------------|

View file

@ -119,8 +119,8 @@ All Possible Alert Types
```python
AlertType = Literal[
"llm_exceptions",
"llm_too_slow",
"llm_exceptions", # LLM API Exceptions
"llm_too_slow", # LLM Responses slower than alerting_threshold
"llm_requests_hanging",
"budget_alerts",
"db_exceptions",
@ -133,6 +133,61 @@ AlertType = Literal[
```
## Advanced - set specific slack channels per alert type
Use this if you want to set specific channels per alert type
**This allows you to do the following**
```
llm_exceptions -> go to slack channel #llm-exceptions
spend_reports -> go to slack channel #llm-spend-reports
```
Set `alert_to_webhook_url` on your config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 0.0001 # (Seconds) set an artificially low threshold for testing alerting
alert_to_webhook_url: {
"llm_exceptions": "https://hooks.slack.com/services/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",
"llm_too_slow": "https://hooks.slack.com/services/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",
"llm_requests_hanging": "https://hooks.slack.com/services/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",
"budget_alerts": "https://hooks.slack.com/services/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",
"db_exceptions": "https://hooks.slack.com/services/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",
"daily_reports": "https://hooks.slack.com/services/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",
"spend_reports": "https://hooks.slack.com/services/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",
"cooldown_deployment": "https://hooks.slack.com/services/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",
"new_model_added": "https://hooks.slack.com/services/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",
"outage_alerts": "https://hooks.slack.com/services/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",
}
litellm_settings:
success_callback: ["langfuse"]
```
Test it - send a valid LLM request - expect to see an `llm_too_slow` alert in its own Slack channel
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-1234" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
## Advanced - Using MS Teams Webhooks

View file

@ -266,6 +266,54 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
## Disable team from turning on/off guardrails
### 1. Disable team from modifying guardrails
```bash
curl -X POST 'http://0.0.0.0:4000/team/update' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{
"team_id": "4198d93c-d375-4c83-8d5a-71e7c5473e50",
"metadata": {"guardrails": {"modify_guardrails": false}}
}'
```
### 2. Try to disable guardrails for a call
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer $LITELLM_VIRTUAL_KEY' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "Think of 10 random colors."
}
],
"metadata": {"guardrails": {"hide_secrets": false}}
}'
```
### 3. Get 403 Error
```
{
"error": {
"message": {
"error": "Your team does not have permission to modify guardrails."
},
"type": "auth_error",
"param": "None",
"code": 403
}
}
```
Expect to NOT see `+1 412-612-9992` in your server logs on your callback.
:::info

View file

@ -48,6 +48,20 @@ A number of these headers could be useful for troubleshooting, but the
`x-litellm-call-id` is the one that is most useful for tracking a request across
components in your system, including in logging tools.
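For example, a minimal sketch that reads the header through the OpenAI SDK's raw-response interface (assumes a local proxy on port 4000):
```python
import openai

client = openai.OpenAI(api_key="anything", base_url="http://0.0.0.0:4000")

raw = client.chat.completions.with_raw_response.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "hi"}],
)
print(raw.headers.get("x-litellm-call-id"))   # forward this id to your logging/tracing tools
print(raw.parse().choices[0].message.content)
```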
## Redacting UserAPIKeyInfo
Redact information about the user API key (hashed token, user_id, team id, etc.) from logs.
Currently supported for Langfuse, OpenTelemetry, Logfire, and ArizeAI logging.
```yaml
litellm_settings:
callbacks: ["langfuse"]
redact_user_api_key_info: true
```
Removes any field with `user_api_key_*` from metadata.
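Illustration with hypothetical metadata fields:
```python
# hypothetical metadata attached to a request, before redaction
metadata = {
    "user_api_key_hash": "88dc28...",
    "user_api_key_user_id": "u-123",
    "requester_ip_address": "10.0.0.4",
}

# after redaction, only the `user_api_key_*` fields are removed:
# {"requester_ip_address": "10.0.0.4"}
```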
## Logging Proxy Input/Output - Langfuse
We will use the `--config` to set `litellm.success_callback = ["langfuse"]`; this will log all successful LLM calls to Langfuse. Make sure to set `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` in your environment.
@ -202,6 +216,9 @@ print(response)
### Team based Logging to Langfuse
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging)
<!--
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
@ -228,7 +245,7 @@ curl -X POST 'http://0.0.0.0:4000/key/generate' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging.
All requests made with these keys will log data to their team-specific logging. -->
### Redacting Messages, Response Content from Langfuse Logging
@ -1106,6 +1123,52 @@ environment_variables:
```
2. Start Proxy
```
litellm --config /path/to/config.yaml
```
3. Test it!
```bash
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "fake-openai-endpoint",
"messages": [
{
"role": "user",
"content": "Hello, Claude gm!"
}
]
}
'
```
Expect to see your log on Langsmith
<Image img={require('../../img/langsmith_new.png')} />
## Logging LLM IO to Arize AI
1. Set `callbacks: ["arize"]` on litellm config.yaml
```yaml
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
callbacks: ["arize"]
environment_variables:
ARIZE_SPACE_KEY: "d0*****"
ARIZE_API_KEY: "141a****"
```
2. Start Proxy
```

View file

@ -70,3 +70,42 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
"user": "usha"
}'
```
## Team Based Logging
[👉 Tutorial - Allow each team to use their own Langfuse Project / custom callbacks](team_logging.md)
<!--
## Logging / Caching
Turn on/off logging and caching for a specific team id.
**Example:**
This config would send langfuse logs to 2 different langfuse projects, based on the team id
```yaml
litellm_settings:
default_team_settings:
- team_id: my-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_1 # Project 1
langfuse_secret: os.environ/LANGFUSE_PRIVATE_KEY_1 # Project 1
- team_id: ishaans-secret-project
success_callback: ["langfuse"]
langfuse_public_key: os.environ/LANGFUSE_PUB_KEY_2 # Project 2
langfuse_secret: os.environ/LANGFUSE_SECRET_2 # Project 2
```
Now, when you [generate keys](./virtual_keys.md) for this team-id
```bash
curl -X POST 'http://0.0.0.0:4000/key/generate' \
-H 'Authorization: Bearer sk-1234' \
-H 'Content-Type: application/json' \
-d '{"team_id": "ishaans-secret-project"}'
```
All requests made with these keys will log data to their team-specific logging. -->

View file

@ -0,0 +1,144 @@
import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 👥📊 Team Based Logging
Allow each team to use their own Langfuse Project / custom callbacks
**This allows you to do the following**
```
Team 1 -> Logs to Langfuse Project 1
Team 2 -> Logs to Langfuse Project 2
Team 3 -> Disabled Logging (for GDPR compliance)
```
## Set Callbacks Per Team
### 1. Set callback for team
We make a request to `POST /team/{team_id}/callback` to add a callback for the team.
```shell
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {
"langfuse_public_key": "pk",
"langfuse_secret_key": "sk_",
"langfuse_host": "https://cloud.langfuse.com"
}
}'
```
#### Supported Values
| Field | Supported Values | Notes |
|-------|------------------|-------|
| `callback_name` | `"langfuse"` | Currently only supports "langfuse" |
| `callback_type` | `"success"`, `"failure"`, `"success_and_failure"` | |
| `callback_vars` | | dict of callback settings |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_public_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_secret_key` | string | Required |
| &nbsp;&nbsp;&nbsp;&nbsp;`langfuse_host` | string | Optional (defaults to https://cloud.langfuse.com) |
### 2. Create key for team
All keys created for team `dbe2f686-a686-4896-864a-4c3924458709` will log to langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
```shell
curl --location 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"team_id": "dbe2f686-a686-4896-864a-4c3924458709"
}'
```
### 3. Make `/chat/completion` request for team
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
Expect this to be logged on the langfuse project specified on [Step 1. Set callback for team](#1-set-callback-for-team)
## Disable Logging for a Team
To disable logging for a specific team, you can use the following endpoint:
`POST /team/{team_id}/disable_logging`
This endpoint removes all success and failure callbacks for the specified team, effectively disabling logging.
### Step 1. Disable logging for team
```shell
curl -X POST 'http://localhost:4000/team/YOUR_TEAM_ID/disable_logging' \
-H 'Authorization: Bearer YOUR_API_KEY'
```
Replace `YOUR_TEAM_ID` with the actual team ID.
**Response**
A successful request will return a response similar to this:
```json
{
"status": "success",
"message": "Logging disabled for team YOUR_TEAM_ID",
"data": {
"team_id": "YOUR_TEAM_ID",
"success_callbacks": [],
"failure_callbacks": []
}
}
```
### Step 2. Test it - `/chat/completions`
Use a key generated for team = `team_id` - you should see no logs on your configured success callback (e.g. Langfuse)
```shell
curl -i http://localhost:4000/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer sk-KbUuE0WNptC0jXapyMmLBA" \
-d '{
"model": "gpt-4",
"messages": [
{"role": "user", "content": "Hello, Claude gm!"}
]
}'
```
### Debugging / Troubleshooting
- Check active callbacks for team using `GET /team/{team_id}/callback`
Use this to check what success/failure callbacks are active for team=`team_id`
```shell
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Authorization: Bearer sk-1234'
```
## Team Logging Endpoints
- [`POST /team/{team_id}/callback` Add a success/failure callback to a team](https://litellm-api.up.railway.app/#/team%20management/add_team_callbacks_team__team_id__callback_post)
- [`GET /team/{team_id}/callback` - Get the success/failure callbacks and variables for a team](https://litellm-api.up.railway.app/#/team%20management/get_team_callbacks_team__team_id__callback_get)

New image file added (117 KiB; binary not shown).

View file

@ -44,19 +44,20 @@ const sidebars = {
"proxy/cost_tracking",
"proxy/self_serve",
"proxy/virtual_keys",
"proxy/tag_routing",
"proxy/users",
"proxy/team_budgets",
"proxy/customers",
"proxy/billing",
"proxy/guardrails",
"proxy/token_auth",
"proxy/alerting",
{
type: "category",
label: "🪢 Logging",
items: ["proxy/logging", "proxy/streaming_logging"],
},
"proxy/team_logging",
"proxy/guardrails",
"proxy/tag_routing",
"proxy/users",
"proxy/team_budgets",
"proxy/customers",
"proxy/billing",
"proxy/token_auth",
"proxy/alerting",
"proxy/ui",
"proxy/prometheus",
"proxy/pass_through",
@ -157,6 +158,7 @@ const sidebars = {
"providers/triton-inference-server",
"providers/ollama",
"providers/perplexity",
"providers/friendliai",
"providers/groq",
"providers/deepseek",
"providers/fireworks_ai",
@ -183,7 +185,14 @@ const sidebars = {
"scheduler",
"set_keys",
"budget_manager",
"secret",
{
type: "category",
label: "Secret Manager",
items: [
"secret",
"oidc"
]
},
"completion/token_usage",
"load_test",
{
@ -192,17 +201,19 @@ const sidebars = {
items: [
"observability/langfuse_integration",
"observability/logfire_integration",
"observability/langsmith_integration",
"observability/arize_integration",
"debugging/local_debugging",
"observability/raw_request_response",
"observability/custom_callback",
"observability/scrub_data",
"observability/helicone_integration",
"observability/braintrust",
"observability/sentry",
"observability/lago",
"observability/helicone_integration",
"observability/openmeter",
"observability/promptlayer_integration",
"observability/wandb_integration",
"observability/langsmith_integration",
"observability/slack_integration",
"observability/traceloop_integration",
"observability/athina_integration",

View file

@ -4,7 +4,7 @@ import warnings
warnings.filterwarnings("ignore", message=".*conflict with protected namespace.*")
### INIT VARIABLES ###
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching import Cache
from litellm._logging import (
@ -38,8 +38,18 @@ success_callback: List[Union[str, Callable]] = []
failure_callback: List[Union[str, Callable]] = []
service_callback: List[Union[str, Callable]] = []
_custom_logger_compatible_callbacks_literal = Literal[
"lago", "openmeter", "logfire", "dynamic_rate_limiter", "langsmith", "galileo"
"lago",
"openmeter",
"logfire",
"dynamic_rate_limiter",
"langsmith",
"galileo",
"braintrust",
"arize",
]
_known_custom_logger_compatible_callbacks: List = list(
get_args(_custom_logger_compatible_callbacks_literal)
)
callbacks: List[Union[Callable, _custom_logger_compatible_callbacks_literal]] = []
_langfuse_default_tags: Optional[
List[
@ -67,6 +77,7 @@ post_call_rules: List[Callable] = []
turn_off_message_logging: Optional[bool] = False
log_raw_request_response: bool = False
redact_messages_in_exceptions: Optional[bool] = False
redact_user_api_key_info: Optional[bool] = False
store_audit_logs = False # Enterprise feature, allow users to see audit logs
## end of callbacks #############
@ -346,6 +357,7 @@ vertex_text_models: List = []
vertex_code_text_models: List = []
vertex_embedding_models: List = []
vertex_anthropic_models: List = []
vertex_llama3_models: List = []
ai21_models: List = []
nlp_cloud_models: List = []
aleph_alpha_models: List = []
@ -388,6 +400,9 @@ for key, value in model_cost.items():
elif value.get("litellm_provider") == "vertex_ai-anthropic_models":
key = key.replace("vertex_ai/", "")
vertex_anthropic_models.append(key)
elif value.get("litellm_provider") == "vertex_ai-llama_models":
key = key.replace("vertex_ai/", "")
vertex_llama3_models.append(key)
elif value.get("litellm_provider") == "ai21":
ai21_models.append(key)
elif value.get("litellm_provider") == "nlp_cloud":
@ -817,6 +832,7 @@ from .llms.petals import PetalsConfig
from .llms.vertex_httpx import VertexGeminiConfig, GoogleAIStudioGeminiConfig
from .llms.vertex_ai import VertexAIConfig, VertexAITextEmbeddingConfig
from .llms.vertex_ai_anthropic import VertexAIAnthropicConfig
from .llms.vertex_ai_llama import VertexAILlama3Config
from .llms.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
@ -872,6 +888,7 @@ from .exceptions import (
APIError,
Timeout,
APIConnectionError,
UnsupportedParamsError,
APIResponseValidationError,
UnprocessableEntityError,
InternalServerError,

View file

@ -682,11 +682,39 @@ class JSONSchemaValidationError(APIError):
)
class UnsupportedParamsError(BadRequestError):
def __init__(
self,
message,
llm_provider: Optional[str] = None,
model: Optional[str] = None,
status_code: int = 400,
response: Optional[httpx.Response] = None,
litellm_debug_info: Optional[str] = None,
max_retries: Optional[int] = None,
num_retries: Optional[int] = None,
):
self.status_code = 400
self.message = "litellm.UnsupportedParamsError: {}".format(message)
self.model = model
self.llm_provider = llm_provider
self.litellm_debug_info = litellm_debug_info
response = response or httpx.Response(
status_code=self.status_code,
request=httpx.Request(
method="GET", url="https://litellm.ai"
), # mock request object
)
self.max_retries = max_retries
self.num_retries = num_retries
LITELLM_EXCEPTION_TYPES = [
AuthenticationError,
NotFoundError,
BadRequestError,
UnprocessableEntityError,
UnsupportedParamsError,
Timeout,
PermissionDeniedError,
RateLimitError,

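A minimal sketch of the newly exported exception's shape (constructor values are illustrative):
```python
import litellm

err = litellm.UnsupportedParamsError(
    message="`response_format` is not supported by this provider",  # illustrative message
    llm_provider="bedrock",
    model="anthropic.claude-3-haiku-20240307-v1:0",
)
print(err.status_code)  # 400
print(err.message)      # "litellm.UnsupportedParamsError: ..."
```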
View file

@ -0,0 +1,286 @@
from enum import Enum
class SpanAttributes:
OUTPUT_VALUE = "output.value"
OUTPUT_MIME_TYPE = "output.mime_type"
"""
The type of output.value. If unspecified, the type is plain text by default.
If type is JSON, the value is a string representing a JSON object.
"""
INPUT_VALUE = "input.value"
INPUT_MIME_TYPE = "input.mime_type"
"""
The type of input.value. If unspecified, the type is plain text by default.
If type is JSON, the value is a string representing a JSON object.
"""
EMBEDDING_EMBEDDINGS = "embedding.embeddings"
"""
A list of objects containing embedding data, including the vector and represented piece of text.
"""
EMBEDDING_MODEL_NAME = "embedding.model_name"
"""
The name of the embedding model.
"""
LLM_FUNCTION_CALL = "llm.function_call"
"""
For models and APIs that support function calling. Records attributes such as the function
name and arguments to the called function.
"""
LLM_INVOCATION_PARAMETERS = "llm.invocation_parameters"
"""
Invocation parameters passed to the LLM or API, such as the model name, temperature, etc.
"""
LLM_INPUT_MESSAGES = "llm.input_messages"
"""
Messages provided to a chat API.
"""
LLM_OUTPUT_MESSAGES = "llm.output_messages"
"""
Messages received from a chat API.
"""
LLM_MODEL_NAME = "llm.model_name"
"""
The name of the model being used.
"""
LLM_PROMPTS = "llm.prompts"
"""
Prompts provided to a completions API.
"""
LLM_PROMPT_TEMPLATE = "llm.prompt_template.template"
"""
The prompt template as a Python f-string.
"""
LLM_PROMPT_TEMPLATE_VARIABLES = "llm.prompt_template.variables"
"""
A list of input variables to the prompt template.
"""
LLM_PROMPT_TEMPLATE_VERSION = "llm.prompt_template.version"
"""
The version of the prompt template being used.
"""
LLM_TOKEN_COUNT_PROMPT = "llm.token_count.prompt"
"""
Number of tokens in the prompt.
"""
LLM_TOKEN_COUNT_COMPLETION = "llm.token_count.completion"
"""
Number of tokens in the completion.
"""
LLM_TOKEN_COUNT_TOTAL = "llm.token_count.total"
"""
Total number of tokens, including both prompt and completion.
"""
TOOL_NAME = "tool.name"
"""
Name of the tool being used.
"""
TOOL_DESCRIPTION = "tool.description"
"""
Description of the tool's purpose, typically used to select the tool.
"""
TOOL_PARAMETERS = "tool.parameters"
"""
Parameters of the tool represented as a dictionary JSON string, e.g.
see https://platform.openai.com/docs/guides/gpt/function-calling
"""
RETRIEVAL_DOCUMENTS = "retrieval.documents"
METADATA = "metadata"
"""
Metadata attributes are used to store user-defined key-value pairs.
For example, LangChain uses metadata to store user-defined attributes for a chain.
"""
TAG_TAGS = "tag.tags"
"""
Custom categorical tags for the span.
"""
OPENINFERENCE_SPAN_KIND = "openinference.span.kind"
SESSION_ID = "session.id"
"""
The id of the session
"""
USER_ID = "user.id"
"""
The id of the user
"""
class MessageAttributes:
"""
Attributes for a message sent to or from an LLM
"""
MESSAGE_ROLE = "message.role"
"""
The role of the message, such as "user", "agent", "function".
"""
MESSAGE_CONTENT = "message.content"
"""
The content of the message to or from the llm, must be a string.
"""
MESSAGE_CONTENTS = "message.contents"
"""
The message contents to the llm, it is an array of
`message_content` prefixed attributes.
"""
MESSAGE_NAME = "message.name"
"""
The name of the message, often used to identify the function
that was used to generate the message.
"""
MESSAGE_TOOL_CALLS = "message.tool_calls"
"""
The tool calls generated by the model, such as function calls.
"""
MESSAGE_FUNCTION_CALL_NAME = "message.function_call_name"
"""
The function name that is a part of the message list.
This is populated for role 'function' or 'agent' as a mechanism to identify
the function that was called during the execution of a tool.
"""
MESSAGE_FUNCTION_CALL_ARGUMENTS_JSON = "message.function_call_arguments_json"
"""
The JSON string representing the arguments passed to the function
during a function call.
"""
class MessageContentAttributes:
"""
Attributes for the contents of user messages sent to an LLM.
"""
MESSAGE_CONTENT_TYPE = "message_content.type"
"""
The type of the content, such as "text" or "image".
"""
MESSAGE_CONTENT_TEXT = "message_content.text"
"""
The text content of the message, if the type is "text".
"""
MESSAGE_CONTENT_IMAGE = "message_content.image"
"""
The image content of the message, if the type is "image".
An image can be made available to the model by passing a link to
the image or by passing the base64 encoded image directly in the
request.
"""
class ImageAttributes:
"""
Attributes for images
"""
IMAGE_URL = "image.url"
"""
An http or base64 image url
"""
class DocumentAttributes:
"""
Attributes for a document.
"""
DOCUMENT_ID = "document.id"
"""
The id of the document.
"""
DOCUMENT_SCORE = "document.score"
"""
The score of the document
"""
DOCUMENT_CONTENT = "document.content"
"""
The content of the document.
"""
DOCUMENT_METADATA = "document.metadata"
"""
The metadata of the document represented as a dictionary
JSON string, e.g. `"{ 'title': 'foo' }"`
"""
class RerankerAttributes:
"""
Attributes for a reranker
"""
RERANKER_INPUT_DOCUMENTS = "reranker.input_documents"
"""
List of documents as input to the reranker
"""
RERANKER_OUTPUT_DOCUMENTS = "reranker.output_documents"
"""
List of documents as output from the reranker
"""
RERANKER_QUERY = "reranker.query"
"""
Query string for the reranker
"""
RERANKER_MODEL_NAME = "reranker.model_name"
"""
Model name of the reranker
"""
RERANKER_TOP_K = "reranker.top_k"
"""
Top K parameter of the reranker
"""
class EmbeddingAttributes:
"""
Attributes for an embedding
"""
EMBEDDING_TEXT = "embedding.text"
"""
The text represented by the embedding.
"""
EMBEDDING_VECTOR = "embedding.vector"
"""
The embedding vector.
"""
class ToolCallAttributes:
"""
Attributes for a tool call
"""
TOOL_CALL_FUNCTION_NAME = "tool_call.function.name"
"""
The name of function that is being called during a tool call.
"""
TOOL_CALL_FUNCTION_ARGUMENTS_JSON = "tool_call.function.arguments"
"""
The JSON string representing the arguments passed to the function
during a tool call.
"""
class OpenInferenceSpanKindValues(Enum):
TOOL = "TOOL"
CHAIN = "CHAIN"
LLM = "LLM"
RETRIEVER = "RETRIEVER"
EMBEDDING = "EMBEDDING"
AGENT = "AGENT"
RERANKER = "RERANKER"
UNKNOWN = "UNKNOWN"
GUARDRAIL = "GUARDRAIL"
EVALUATOR = "EVALUATOR"
class OpenInferenceMimeTypeValues(Enum):
TEXT = "text/plain"
JSON = "application/json"

View file

@ -0,0 +1,114 @@
"""
Arize AI is OTEL compatible.
This file has Arize AI specific helper functions.
"""
from typing import TYPE_CHECKING, Any, Optional, Union
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
Span = _Span
else:
Span = Any
def set_arize_ai_attributes(span: Span, kwargs, response_obj):
from litellm.integrations._types.open_inference import (
MessageAttributes,
MessageContentAttributes,
OpenInferenceSpanKindValues,
SpanAttributes,
)
optional_params = kwargs.get("optional_params", {})
litellm_params = kwargs.get("litellm_params", {}) or {}
#############################################
############ LLM CALL METADATA ##############
#############################################
# commented out for now - looks like Arize AI could not log this
# metadata = litellm_params.get("metadata", {}) or {}
# span.set_attribute(SpanAttributes.METADATA, str(metadata))
#############################################
########## LLM Request Attributes ###########
#############################################
# The name of the LLM a request is being made to
if kwargs.get("model"):
span.set_attribute(SpanAttributes.LLM_MODEL_NAME, kwargs.get("model"))
span.set_attribute(
SpanAttributes.OPENINFERENCE_SPAN_KIND, OpenInferenceSpanKindValues.LLM.value
)
messages = kwargs.get("messages")
# for /chat/completions
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
if messages:
span.set_attribute(
SpanAttributes.INPUT_VALUE,
messages[-1].get("content", ""), # get the last message for input
)
# LLM_INPUT_MESSAGES shows up under `input_messages` tab on the span page
for idx, msg in enumerate(messages):
# Set the role per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_ROLE}",
msg["role"],
)
# Set the content per message
span.set_attribute(
f"{SpanAttributes.LLM_INPUT_MESSAGES}.{idx}.{MessageAttributes.MESSAGE_CONTENT}",
msg.get("content", ""),
)
# Invocation parameters sent to the LLM (temperature, max_tokens, etc.)
span.set_attribute(SpanAttributes.LLM_INVOCATION_PARAMETERS, str(optional_params))
if optional_params.get("user"):
span.set_attribute(SpanAttributes.USER_ID, optional_params.get("user"))
#############################################
########## LLM Response Attributes ##########
# https://docs.arize.com/arize/large-language-models/tracing/semantic-conventions
#############################################
for choice in response_obj.get("choices"):
response_message = choice.get("message", {})
span.set_attribute(
SpanAttributes.OUTPUT_VALUE, response_message.get("content", "")
)
# This shows up under `output_messages` tab on the span page
# This code assumes a single response
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_ROLE}",
response_message["role"],
)
span.set_attribute(
f"{SpanAttributes.LLM_OUTPUT_MESSAGES}.0.{MessageAttributes.MESSAGE_CONTENT}",
response_message.get("content", ""),
)
usage = response_obj.get("usage")
if usage:
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_TOTAL,
usage.get("total_tokens"),
)
# The number of tokens used in the LLM response (completion).
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_COMPLETION,
usage.get("completion_tokens"),
)
# The number of tokens used in the LLM prompt.
span.set_attribute(
SpanAttributes.LLM_TOKEN_COUNT_PROMPT,
usage.get("prompt_tokens"),
)
pass

View file

@ -0,0 +1,369 @@
# What is this?
## Log success + failure events to Braintrust
import copy
import json
import os
import threading
import traceback
import uuid
from typing import Literal, Optional
import dotenv
import httpx
import litellm
from litellm import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import get_formatted_prompt
global_braintrust_http_handler = AsyncHTTPHandler()
global_braintrust_sync_http_handler = HTTPHandler()
API_BASE = "https://api.braintrustdata.com/v1"
def get_utc_datetime():
import datetime as dt
from datetime import datetime
if hasattr(dt, "UTC"):
return datetime.now(dt.UTC) # type: ignore
else:
return datetime.utcnow() # type: ignore
class BraintrustLogger(CustomLogger):
def __init__(
self, api_key: Optional[str] = None, api_base: Optional[str] = None
) -> None:
super().__init__()
self.validate_environment(api_key=api_key)
self.api_base = api_base or API_BASE
self.default_project_id = None
self.api_key: str = api_key or os.getenv("BRAINTRUST_API_KEY") # type: ignore
self.headers = {
"Authorization": "Bearer " + self.api_key,
"Content-Type": "application/json",
}
def validate_environment(self, api_key: Optional[str]):
"""
Expects
BRAINTRUST_API_KEY
in the environment
"""
missing_keys = []
if api_key is None and os.getenv("BRAINTRUST_API_KEY", None) is None:
missing_keys.append("BRAINTRUST_API_KEY")
if len(missing_keys) > 0:
raise Exception("Missing keys={} in environment.".format(missing_keys))
@staticmethod
def add_metadata_from_header(litellm_params: dict, metadata: dict) -> dict:
"""
Adds metadata from proxy request headers to Braintrust logging if keys start with "braintrust"
and overwrites litellm_params.metadata if already included.
For example, to pass extra metadata via header, send
`headers: { ..., braintrust_my_field: my-value }` via proxy request.
"""
if litellm_params is None:
return metadata
if litellm_params.get("proxy_server_request") is None:
return metadata
if metadata is None:
metadata = {}
proxy_headers = (
litellm_params.get("proxy_server_request", {}).get("headers", {}) or {}
)
for metadata_param_key in proxy_headers:
if metadata_param_key.startswith("braintrust"):
trace_param_key = metadata_param_key.replace("braintrust", "", 1)
if trace_param_key in metadata:
verbose_logger.warning(
f"Overwriting Braintrust `{trace_param_key}` from request header"
)
else:
verbose_logger.debug(
f"Found Braintrust `{trace_param_key}` in request header"
)
metadata[trace_param_key] = proxy_headers.get(metadata_param_key)
return metadata
async def create_default_project_and_experiment(self):
project = await global_braintrust_http_handler.post(
f"{self.api_base}/project", headers=self.headers, json={"name": "litellm"}
)
project_dict = project.json()
self.default_project_id = project_dict["id"]
def create_sync_default_project_and_experiment(self):
project = global_braintrust_sync_http_handler.post(
f"{self.api_base}/project", headers=self.headers, json={"name": "litellm"}
)
project_dict = project.json()
self.default_project_id = project_dict["id"]
def log_success_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
try:
litellm_call_id = kwargs.get("litellm_call_id")
project_id = kwargs.get("project_id", None)
if project_id is None:
if self.default_project_id is None:
self.create_sync_default_project_and_experiment()
project_id = self.default_project_id
prompt = {"messages": kwargs.get("messages")}
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
input = prompt
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
input = prompt
output = response_obj.choices[0].text
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
input = prompt
output = response_obj["data"]
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
metadata = new_metadata
tags = []
if isinstance(metadata, dict):
for key, value in metadata.items():
# generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy
if (
litellm._langfuse_default_tags is not None
and isinstance(litellm._langfuse_default_tags, list)
and key in litellm._langfuse_default_tags
):
tags.append(f"{key}:{value}")
# clean litellm metadata before logging
if key in [
"headers",
"endpoint",
"caching_groups",
"previous_models",
]:
continue
else:
clean_metadata[key] = value
cost = kwargs.get("response_cost", None)
if cost is not None:
clean_metadata["litellm_response_cost"] = cost
metrics: Optional[dict] = None
if (
response_obj is not None
and hasattr(response_obj, "usage")
and isinstance(response_obj.usage, litellm.Usage)
):
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
metrics = {
"prompt_tokens": response_obj.usage.prompt_tokens,
"completion_tokens": response_obj.usage.completion_tokens,
"total_tokens": response_obj.usage.total_tokens,
"total_cost": cost,
}
request_data = {
"id": litellm_call_id,
"input": prompt,
"output": output,
"metadata": clean_metadata,
"tags": tags,
}
if metrics is not None:
request_data["metrics"] = metrics
try:
global_braintrust_sync_http_handler.post(
url=f"{self.api_base}/project_logs/{project_id}/insert",
json={"events": [request_data]},
headers=self.headers,
)
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.error(
"Error logging to braintrust - Exception received - {}\n{}".format(
str(e), traceback.format_exc()
)
)
raise e
async def async_log_success_event(self, kwargs, response_obj, start_time, end_time):
verbose_logger.debug("REACHES BRAINTRUST SUCCESS")
try:
litellm_call_id = kwargs.get("litellm_call_id")
project_id = kwargs.get("project_id", None)
if project_id is None:
if self.default_project_id is None:
await self.create_default_project_and_experiment()
project_id = self.default_project_id
prompt = {"messages": kwargs.get("messages")}
if response_obj is not None and (
kwargs.get("call_type", None) == "embedding"
or isinstance(response_obj, litellm.EmbeddingResponse)
):
input = prompt
output = None
elif response_obj is not None and isinstance(
response_obj, litellm.ModelResponse
):
input = prompt
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
input = prompt
output = response_obj.choices[0].text
elif response_obj is not None and isinstance(
response_obj, litellm.ImageResponse
):
input = prompt
output = response_obj["data"]
litellm_params = kwargs.get("litellm_params", {})
metadata = (
litellm_params.get("metadata", {}) or {}
) # if litellm_params['metadata'] == None
metadata = self.add_metadata_from_header(litellm_params, metadata)
clean_metadata = {}
try:
metadata = copy.deepcopy(
metadata
) # Avoid modifying the original metadata
except:
new_metadata = {}
for key, value in metadata.items():
if (
isinstance(value, list)
or isinstance(value, dict)
or isinstance(value, str)
or isinstance(value, int)
or isinstance(value, float)
):
new_metadata[key] = copy.deepcopy(value)
metadata = new_metadata
tags = []
if isinstance(metadata, dict):
for key, value in metadata.items():
# generate langfuse tags - Default Tags sent to Langfuse from LiteLLM Proxy
if (
litellm._langfuse_default_tags is not None
and isinstance(litellm._langfuse_default_tags, list)
and key in litellm._langfuse_default_tags
):
tags.append(f"{key}:{value}")
# clean litellm metadata before logging
if key in [
"headers",
"endpoint",
"caching_groups",
"previous_models",
]:
continue
else:
clean_metadata[key] = value
cost = kwargs.get("response_cost", None)
if cost is not None:
clean_metadata["litellm_response_cost"] = cost
metrics: Optional[dict] = None
if (
response_obj is not None
and hasattr(response_obj, "usage")
and isinstance(response_obj.usage, litellm.Usage)
):
generation_id = litellm.utils.get_logging_id(start_time, response_obj)
metrics = {
"prompt_tokens": response_obj.usage.prompt_tokens,
"completion_tokens": response_obj.usage.completion_tokens,
"total_tokens": response_obj.usage.total_tokens,
"total_cost": cost,
}
request_data = {
"id": litellm_call_id,
"input": prompt,
"output": output,
"metadata": clean_metadata,
"tags": tags,
}
if metrics is not None:
request_data["metrics"] = metrics
try:
await global_braintrust_http_handler.post(
url=f"{self.api_base}/project_logs/{project_id}/insert",
json={"events": [request_data]},
headers=self.headers,
)
except httpx.HTTPStatusError as e:
raise Exception(e.response.text)
except Exception as e:
verbose_logger.error(
"Error logging to braintrust - Exception received - {}\n{}".format(
str(e), traceback.format_exc()
)
)
raise e
def log_failure_event(self, kwargs, response_obj, start_time, end_time):
return super().log_failure_event(kwargs, response_obj, start_time, end_time)
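For context, a minimal sketch of enabling the logger above from the SDK. The callback name matches the `"braintrust"` integration registered later in this diff; the `BRAINTRUST_API_KEY` environment variable is an assumption about where the logger reads its key from:

```python
import os
import litellm

os.environ["BRAINTRUST_API_KEY"] = ""  # assumption: the logger picks up its key from this env var
os.environ["OPENAI_API_KEY"] = ""      # key for the underlying LLM call

# register the logger shown above; success events are POSTed to /project_logs/{project_id}/insert
litellm.callbacks = ["braintrust"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
)
```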

View file

@ -8,6 +8,7 @@ from packaging.version import Version
import litellm
from litellm._logging import verbose_logger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
class LangFuseLogger:
@ -382,6 +383,8 @@ class LangFuseLogger:
mask_input = clean_metadata.pop("mask_input", False)
mask_output = clean_metadata.pop("mask_output", False)
clean_metadata = redact_user_api_key_info(metadata=clean_metadata)
if trace_name is None and existing_trace_id is None:
# just log `litellm-{call_type}` as the trace name
## DO NOT SET TRACE_NAME if trace-id set. this can lead to overwriting of past traces.

View file

@ -79,6 +79,7 @@ class LangsmithLogger(CustomLogger):
project_name = metadata.get("project_name", self.langsmith_project)
run_name = metadata.get("run_name", self.langsmith_default_run_name)
run_id = metadata.get("id", None)
tags = metadata.get("tags", []) or []
verbose_logger.debug(
f"Langsmith Logging - project_name: {project_name}, run_name {run_name}"
)
@ -122,6 +123,7 @@ class LangsmithLogger(CustomLogger):
"session_name": project_name,
"start_time": start_time,
"end_time": end_time,
"tags": tags,
}
if run_id:
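A short sketch of how the new `tags` field can be exercised from the SDK - per-request metadata is what `metadata.get("tags", [])` above reads from (model name and tag values are placeholders, and LangSmith credentials are assumed to be configured in the environment):

```python
import litellm

litellm.callbacks = ["langsmith"]

# "tags" in request metadata is now forwarded onto the LangSmith run
response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hi"}],
    metadata={"run_name": "demo-run", "tags": ["prod", "eu-region"]},
)
```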

View file

@ -1,17 +1,21 @@
#### What this does ####
# On success + failure, log events to Logfire
import dotenv, os
import os
import dotenv
dotenv.load_dotenv() # Loading env variables using dotenv
import traceback
import uuid
from litellm._logging import print_verbose, verbose_logger
from enum import Enum
from typing import Any, Dict, NamedTuple
from typing_extensions import LiteralString
from litellm._logging import print_verbose, verbose_logger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
class SpanConfig(NamedTuple):
message_template: LiteralString
@ -135,6 +139,8 @@ class LogfireLogger:
else:
clean_metadata[key] = value
clean_metadata = redact_user_api_key_info(metadata=clean_metadata)
# Build the initial payload
payload = {
"id": id,

View file

@ -2,11 +2,12 @@ import os
from dataclasses import dataclass
from datetime import datetime
from functools import wraps
from typing import TYPE_CHECKING, Any, Optional, Union
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.custom_logger import CustomLogger
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
from litellm.types.services import ServiceLoggerPayload
if TYPE_CHECKING:
@ -27,9 +28,10 @@ else:
LITELLM_TRACER_NAME = os.getenv("OTEL_TRACER_NAME", "litellm")
LITELLM_RESOURCE = {
LITELLM_RESOURCE: Dict[Any, Any] = {
"service.name": os.getenv("OTEL_SERVICE_NAME", "litellm"),
"deployment.environment": os.getenv("OTEL_ENVIRONMENT_NAME", "production"),
"model_id": os.getenv("OTEL_SERVICE_NAME", "litellm"),
}
RAW_REQUEST_SPAN_NAME = "raw_gen_ai_request"
LITELLM_REQUEST_SPAN_NAME = "litellm_request"
@ -68,7 +70,9 @@ class OpenTelemetryConfig:
class OpenTelemetry(CustomLogger):
def __init__(self, config=OpenTelemetryConfig.from_env()):
def __init__(
self, config=OpenTelemetryConfig.from_env(), callback_name: Optional[str] = None
):
from opentelemetry import trace
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
@ -79,6 +83,7 @@ class OpenTelemetry(CustomLogger):
self.OTEL_HEADERS = self.config.headers
provider = TracerProvider(resource=Resource(attributes=LITELLM_RESOURCE))
provider.add_span_processor(self._get_span_processor())
self.callback_name = callback_name
trace.set_tracer_provider(provider)
self.tracer = trace.get_tracer(LITELLM_TRACER_NAME)
@ -120,8 +125,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
_end_time_ns = end_time
_start_time_ns = 0
_end_time_ns = 0
if isinstance(start_time, float):
_start_time_ns = int(int(start_time) * 1e9)
@ -159,8 +164,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = start_time
_end_time_ns = end_time
_start_time_ns = 0
_end_time_ns = 0
if isinstance(start_time, float):
_start_time_ns = int(int(start_time) * 1e9)
@ -294,6 +299,11 @@ class OpenTelemetry(CustomLogger):
return isinstance(value, (str, bool, int, float))
def set_attributes(self, span: Span, kwargs, response_obj):
if self.callback_name == "arize":
from litellm.integrations.arize_ai import set_arize_ai_attributes
set_arize_ai_attributes(span, kwargs, response_obj)
return
from litellm.proxy._types import SpanAttributes
optional_params = kwargs.get("optional_params", {})
@ -306,7 +316,9 @@ class OpenTelemetry(CustomLogger):
#############################################
metadata = litellm_params.get("metadata", {}) or {}
for key, value in metadata.items():
clean_metadata = redact_user_api_key_info(metadata=metadata)
for key, value in clean_metadata.items():
if self.is_primitive(value):
span.set_attribute("metadata.{}".format(key), value)
@ -612,8 +624,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
_end_time_ns = logging_payload.end_time
_start_time_ns = 0
_end_time_ns = 0
start_time = logging_payload.start_time
end_time = logging_payload.end_time
@ -658,8 +670,8 @@ class OpenTelemetry(CustomLogger):
from opentelemetry import trace
from opentelemetry.trace import Status, StatusCode
_start_time_ns = logging_payload.start_time
_end_time_ns = logging_payload.end_time
_start_time_ns = 0
_end_time_ns = 0
start_time = logging_payload.start_time
end_time = logging_payload.end_time

View file

@ -53,6 +53,7 @@ from litellm.utils import (
from ..integrations.aispend import AISpendLogger
from ..integrations.athina import AthinaLogger
from ..integrations.berrispend import BerriSpendLogger
from ..integrations.braintrust_logging import BraintrustLogger
from ..integrations.clickhouse import ClickhouseLogger
from ..integrations.custom_logger import CustomLogger
from ..integrations.datadog import DataDogLogger
@ -1945,7 +1946,14 @@ def _init_custom_logger_compatible_class(
_openmeter_logger = OpenMeterLogger()
_in_memory_loggers.append(_openmeter_logger)
return _openmeter_logger # type: ignore
elif logging_integration == "braintrust":
for callback in _in_memory_loggers:
if isinstance(callback, BraintrustLogger):
return callback # type: ignore
braintrust_logger = BraintrustLogger()
_in_memory_loggers.append(braintrust_logger)
return braintrust_logger # type: ignore
elif logging_integration == "langsmith":
for callback in _in_memory_loggers:
if isinstance(callback, LangsmithLogger):
@ -1954,6 +1962,43 @@ def _init_custom_logger_compatible_class(
_langsmith_logger = LangsmithLogger()
_in_memory_loggers.append(_langsmith_logger)
return _langsmith_logger # type: ignore
elif logging_integration == "arize":
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if "ARIZE_API_KEY" not in os.environ:
raise ValueError("ARIZE_API_KEY not found in environment variables")
from litellm.integrations.opentelemetry import (
OpenTelemetry,
OpenTelemetryConfig,
)
otel_config = OpenTelemetryConfig(
exporter="otlp_grpc",
endpoint="https://otlp.arize.com/v1",
)
os.environ["OTEL_EXPORTER_OTLP_TRACES_HEADERS"] = (
f"space_key={os.getenv('ARIZE_SPACE_KEY')},api_key={os.getenv('ARIZE_API_KEY')}"
)
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
and callback.callback_name == "arize"
):
return callback # type: ignore
_otel_logger = OpenTelemetry(config=otel_config, callback_name="arize")
_in_memory_loggers.append(_otel_logger)
return _otel_logger # type: ignore
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
for callback in _in_memory_loggers:
if isinstance(callback, OpenTelemetry):
return callback # type: ignore
otel_logger = OpenTelemetry()
_in_memory_loggers.append(otel_logger)
return otel_logger # type: ignore
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
@ -2019,6 +2064,10 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, OpenMeterLogger):
return callback
elif logging_integration == "braintrust":
for callback in _in_memory_loggers:
if isinstance(callback, BraintrustLogger):
return callback
elif logging_integration == "galileo":
for callback in _in_memory_loggers:
if isinstance(callback, GalileoObserve):
@ -2027,6 +2076,25 @@ def get_custom_logger_compatible_class(
for callback in _in_memory_loggers:
if isinstance(callback, LangsmithLogger):
return callback
elif logging_integration == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry
for callback in _in_memory_loggers:
if isinstance(callback, OpenTelemetry):
return callback
elif logging_integration == "arize":
from litellm.integrations.opentelemetry import OpenTelemetry
if "ARIZE_SPACE_KEY" not in os.environ:
raise ValueError("ARIZE_SPACE_KEY not found in environment variables")
if "ARIZE_API_KEY" not in os.environ:
raise ValueError("ARIZE_API_KEY not found in environment variables")
for callback in _in_memory_loggers:
if (
isinstance(callback, OpenTelemetry)
and callback.callback_name == "arize"
):
return callback
elif logging_integration == "logfire":
if "LOGFIRE_TOKEN" not in os.environ:
raise ValueError("LOGFIRE_TOKEN not found in environment variables")

View file

@ -87,3 +87,33 @@ def redact_message_input_output_from_logging(
# by default return result
return result
def redact_user_api_key_info(metadata: dict) -> dict:
"""
Removes any `user_api_key_*` info from metadata before it is passed to the logging object, if the flag is set.
Usage:
SDK
```python
litellm.redact_user_api_key_info = True
```
PROXY:
```yaml
litellm_settings:
redact_user_api_key_info: true
```
"""
if litellm.redact_user_api_key_info is not True:
return metadata
new_metadata = {}
for k, v in metadata.items():
if isinstance(k, str) and k.startswith("user_api_key"):
pass
else:
new_metadata[k] = v
return new_metadata
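A small sketch of the behaviour: with the flag enabled, any metadata key prefixed with `user_api_key` is dropped before the logger sees it (sample values are illustrative):

```python
import litellm
from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info

litellm.redact_user_api_key_info = True

metadata = {
    "user_api_key_hash": "88dc28d0...",   # dropped
    "user_api_key_alias": "team-key",     # dropped
    "requester_ip_address": "10.0.0.1",   # kept
}
print(redact_user_api_key_info(metadata=metadata))
# -> {"requester_ip_address": "10.0.0.1"}
```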

View file

@ -385,6 +385,11 @@ class AnthropicConfig:
if "user_id" in anthropic_message_request["metadata"]:
new_kwargs["user"] = anthropic_message_request["metadata"]["user_id"]
# Pass litellm proxy specific metadata
if "litellm_metadata" in anthropic_message_request:
# metadata will be passed to litellm.acompletion(), it's a litellm_param
new_kwargs["metadata"] = anthropic_message_request.pop("litellm_metadata")
## CONVERT TOOL CHOICE
if "tool_choice" in anthropic_message_request:
new_kwargs["tool_choice"] = self.translate_anthropic_tool_choice_to_openai(
@ -775,8 +780,17 @@ class AnthropicChatCompletion(BaseLLM):
system_prompt = ""
for idx, message in enumerate(messages):
if message["role"] == "system":
system_prompt += message["content"]
system_prompt_indices.append(idx)
valid_content: bool = False
if isinstance(message["content"], str):
system_prompt += message["content"]
valid_content = True
elif isinstance(message["content"], list):
for content in message["content"]:
system_prompt += content.get("text", "")
valid_content = True
if valid_content:
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
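Sketch of what the new branch accepts - a system message whose content is a list of text blocks is now folded into Anthropic's top-level system prompt, where the previous string concatenation would have failed (model name is illustrative, and `ANTHROPIC_API_KEY` is assumed to be set):

```python
import litellm

messages = [
    # both the plain-string form and the list-of-text-blocks form are handled now
    {"role": "system", "content": [{"type": "text", "text": "You are a terse assistant."}]},
    {"role": "user", "content": "Hello"},
]
response = litellm.completion(model="claude-3-opus-20240229", messages=messages)
```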

View file

@ -76,6 +76,8 @@ BEDROCK_CONVERSE_MODELS = [
"anthropic.claude-v1",
"anthropic.claude-instant-v1",
"ai21.jamba-instruct-v1:0",
"meta.llama3-1-8b-instruct-v1:0",
"meta.llama3-1-70b-instruct-v1:0",
]
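With the two Llama 3.1 entries added to `BEDROCK_CONVERSE_MODELS`, a call along these lines should route through the Converse API (a sketch; AWS credentials and region are assumed to be configured in the environment):

```python
import litellm

# assumes AWS credentials + region are already set in the environment
response = litellm.completion(
    model="bedrock/meta.llama3-1-8b-instruct-v1:0",
    messages=[{"role": "user", "content": "Hi"}],
)
```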
@ -1729,7 +1731,7 @@ class BedrockConverseLLM(BaseLLM):
headers={},
client: Optional[AsyncHTTPHandler] = None,
) -> Union[ModelResponse, CustomStreamWrapper]:
if client is None:
if client is None or not isinstance(client, AsyncHTTPHandler):
_params = {}
if timeout is not None:
if isinstance(timeout, float) or isinstance(timeout, int):

View file

@ -968,7 +968,7 @@ class OpenAIChatCompletion(BaseLLM):
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
if e.body is not None and e.body.get("detail"): # type: ignore
if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"): # type: ignore
detail = e.body.get("detail") # type: ignore
invalid_params: List[str] = []
if (
@ -1100,7 +1100,7 @@ class OpenAIChatCompletion(BaseLLM):
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
if e.body is not None and e.body.get("detail"): # type: ignore
if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"): # type: ignore
detail = e.body.get("detail") # type: ignore
invalid_params: List[str] = []
if (
@ -1231,7 +1231,7 @@ class OpenAIChatCompletion(BaseLLM):
except openai.UnprocessableEntityError as e:
## check if body contains unprocessable params - related issue https://github.com/BerriAI/litellm/issues/4800
if litellm.drop_params is True or drop_params is True:
if e.body is not None and e.body.get("detail"): # type: ignore
if e.body is not None and isinstance(e.body, dict) and e.body.get("detail"): # type: ignore
detail = e.body.get("detail") # type: ignore
invalid_params: List[str] = []
if (

View file

@ -1,23 +1,31 @@
import copy
import json
import os
import time
import types
from enum import Enum
from typing import Callable, List, Optional
from typing import Any, Callable, Dict, List, Optional, Sequence, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.utils import (
Choices,
CustomStreamWrapper,
Delta,
EmbeddingResponse,
Message,
ModelResponse,
Usage,
map_finish_reason,
)
from .base import BaseLLM
from .prompt_templates.factory import custom_prompt, prompt_factory
class TritonError(Exception):
def __init__(self, status_code, message):
def __init__(self, status_code: int, message: str) -> None:
self.status_code = status_code
self.message = message
self.request = httpx.Request(
@ -41,8 +49,7 @@ class TritonChatCompletion(BaseLLM):
api_base: str,
logging_obj=None,
api_key: Optional[str] = None,
):
) -> EmbeddingResponse:
async_handler = AsyncHTTPHandler(
timeout=httpx.Timeout(timeout=600.0, connect=5.0)
)
@ -79,10 +86,10 @@ class TritonChatCompletion(BaseLLM):
return model_response
def embedding(
async def embedding(
self,
model: str,
input: list,
input: List[str],
timeout: float,
api_base: str,
model_response: litellm.utils.EmbeddingResponse,
@ -90,8 +97,8 @@ class TritonChatCompletion(BaseLLM):
logging_obj=None,
optional_params=None,
client=None,
aembedding=None,
):
aembedding: bool = False,
) -> EmbeddingResponse:
data_for_triton = {
"inputs": [
{
@ -103,8 +110,6 @@ class TritonChatCompletion(BaseLLM):
]
}
## LOGGING
curl_string = f"curl {api_base} -X POST -H 'Content-Type: application/json' -d '{data_for_triton}'"
logging_obj.pre_call(
@ -116,8 +121,8 @@ class TritonChatCompletion(BaseLLM):
},
)
if aembedding == True:
response = self.aembedding(
if aembedding:
response = await self.aembedding(
data=data_for_triton,
model_response=model_response,
logging_obj=logging_obj,
@ -130,6 +135,198 @@ class TritonChatCompletion(BaseLLM):
"Only async embedding supported for triton, please use litellm.aembedding() for now"
)
def completion(
self,
model: str,
messages: List[dict],
timeout: float,
api_base: str,
model_response: ModelResponse,
api_key: Optional[str] = None,
logging_obj=None,
optional_params=None,
client=None,
stream: Optional[bool] = False,
acompletion: bool = False,
) -> ModelResponse:
type_of_model = ""
optional_params.pop("stream", False)
if api_base.endswith("generate"): ### This is a trtllm model
text_input = messages[0]["content"]
data_for_triton: Dict[str, Any] = {
"text_input": prompt_factory(model=model, messages=messages),
"parameters": {
"max_tokens": int(optional_params.get("max_tokens", 2000)),
"bad_words": [""],
"stop_words": [""],
},
"stream": bool(stream),
}
data_for_triton["parameters"].update(optional_params)
type_of_model = "trtllm"
elif api_base.endswith(
"infer"
): ### This is an infer model with a custom model on triton
text_input = messages[0]["content"]
data_for_triton = {
"inputs": [
{
"name": "text_input",
"shape": [1],
"datatype": "BYTES",
"data": [text_input],
}
]
}
for k, v in optional_params.items():
if not (k == "stream" or k == "max_retries"):
datatype = "INT32" if isinstance(v, int) else "BYTES"
datatype = "FP32" if isinstance(v, float) else datatype
data_for_triton["inputs"].append(
{"name": k, "shape": [1], "datatype": datatype, "data": [v]}
)
if "max_tokens" not in optional_params:
data_for_triton["inputs"].append(
{
"name": "max_tokens",
"shape": [1],
"datatype": "INT32",
"data": [20],
}
)
type_of_model = "infer"
else: ## Unknown model type passthrough
data_for_triton = {
"inputs": [
{
"name": "text_input",
"shape": [1],
"datatype": "BYTES",
"data": [messages[0]["content"]],
}
]
}
if logging_obj:
logging_obj.pre_call(
input=messages,
api_key=api_key,
additional_args={
"complete_input_dict": optional_params,
"api_base": api_base,
"http_client": client,
},
)
headers = {"Content-Type": "application/json"}
json_data_for_triton: str = json.dumps(data_for_triton)
if acompletion:
return self.acompletion( # type: ignore
model,
json_data_for_triton,
headers=headers,
logging_obj=logging_obj,
api_base=api_base,
stream=stream,
model_response=model_response,
type_of_model=type_of_model,
)
else:
handler = HTTPHandler()
if stream:
return self._handle_stream(
handler, api_base, data_for_triton, model, logging_obj
)
else:
response = handler.post(url=api_base, data=data_for_triton, headers=headers)
return self._handle_response(
response, model_response, logging_obj, type_of_model=type_of_model
)
async def acompletion(
self,
model: str,
data_for_triton,
api_base,
stream,
logging_obj,
headers,
model_response,
type_of_model,
) -> ModelResponse:
handler = AsyncHTTPHandler()
if stream:
return self._ahandle_stream(
handler, api_base, data_for_triton, model, logging_obj
)
else:
response = await handler.post(
url=api_base, data=data_for_triton, headers=headers
)
return self._handle_response(
response, model_response, logging_obj, type_of_model=type_of_model
)
def _handle_stream(self, handler, api_base, data_for_triton, model, logging_obj):
response = handler.post(
url=api_base + "_stream", data=data_for_triton, stream=True
)
streamwrapper = litellm.CustomStreamWrapper(
response.iter_lines(),
model=model,
custom_llm_provider="triton",
logging_obj=logging_obj,
)
for chunk in streamwrapper:
yield (chunk)
async def _ahandle_stream(
self, handler, api_base, data_for_triton, model, logging_obj
):
response = await handler.post(
url=api_base + "_stream", data=data_for_triton, stream=True
)
streamwrapper = litellm.CustomStreamWrapper(
response.aiter_lines(),
model=model,
custom_llm_provider="triton",
logging_obj=logging_obj,
)
async for chunk in streamwrapper:
yield (chunk)
def _handle_response(self, response, model_response, logging_obj, type_of_model):
if logging_obj:
logging_obj.post_call(original_response=response)
if response.status_code != 200:
raise TritonError(status_code=response.status_code, message=response.text)
_json_response = response.json()
model_response.model = _json_response.get("model_name", "None")
if type_of_model == "trtllm":
model_response.choices = [
Choices(index=0, message=Message(content=_json_response["text_output"]))
]
elif type_of_model == "infer":
model_response.choices = [
Choices(
index=0,
message=Message(content=_json_response["outputs"][0]["data"]),
)
]
else:
model_response.choices = [
Choices(index=0, message=Message(content=_json_response["outputs"]))
]
return model_response
@staticmethod
def split_embedding_by_shape(
data: List[float], shape: List[int]

View file

@ -0,0 +1,203 @@
# What is this?
## Handler for calling llama 3.1 API on Vertex AI
import copy
import json
import os
import time
import types
import uuid
from enum import Enum
from typing import Any, Callable, List, Optional, Tuple, Union
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.litellm_core_utils.core_helpers import map_finish_reason
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.anthropic import (
AnthropicMessagesTool,
AnthropicMessagesToolChoice,
)
from litellm.types.llms.openai import (
ChatCompletionToolParam,
ChatCompletionToolParamFunctionChunk,
)
from litellm.types.utils import ResponseFormatChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage
from .base import BaseLLM
from .prompt_templates.factory import (
construct_tool_use_system_prompt,
contains_tag,
custom_prompt,
extract_between_tags,
parse_xml_params,
prompt_factory,
response_schema_prompt,
)
class VertexAIError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
method="POST", url=" https://cloud.google.com/vertex-ai/"
)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class VertexAILlama3Config:
"""
Reference: https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama#streaming
The class `VertexAILlama3Config` provides configuration for Vertex AI's Llama API interface. Below are the parameters:
- `max_tokens` (integer) - the maximum number of tokens to generate
Note: Please make sure to modify the default parameters as required for your use case.
"""
max_tokens: Optional[int] = None
def __init__(
self,
max_tokens: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key == "max_tokens" and value is None:
value = self.max_tokens
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
def get_supported_openai_params(self):
return [
"max_tokens",
"stream",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
if param == "max_tokens":
optional_params["max_tokens"] = value
return optional_params
class VertexAILlama3(BaseLLM):
def __init__(self) -> None:
pass
def create_vertex_llama3_url(
self, vertex_location: str, vertex_project: str
) -> str:
return f"https://{vertex_location}-aiplatform.googleapis.com/v1beta1/projects/{vertex_project}/locations/{vertex_location}/endpoints/openapi"
def completion(
self,
model: str,
messages: list,
model_response: ModelResponse,
print_verbose: Callable,
encoding,
logging_obj,
optional_params: dict,
custom_prompt_dict: dict,
headers: Optional[dict],
timeout: Union[float, httpx.Timeout],
vertex_project=None,
vertex_location=None,
vertex_credentials=None,
litellm_params=None,
logger_fn=None,
acompletion: bool = False,
client=None,
):
try:
import vertexai
from google.cloud import aiplatform
from litellm.llms.openai import OpenAIChatCompletion
from litellm.llms.vertex_httpx import VertexLLM
except Exception:
raise VertexAIError(
status_code=400,
message="""vertexai import failed please run `pip install -U "google-cloud-aiplatform>=1.38"`""",
)
if not (
hasattr(vertexai, "preview") or hasattr(vertexai.preview, "language_models")
):
raise VertexAIError(
status_code=400,
message="""Upgrade vertex ai. Run `pip install "google-cloud-aiplatform>=1.38"`""",
)
try:
vertex_httpx_logic = VertexLLM()
access_token, project_id = vertex_httpx_logic._ensure_access_token(
credentials=vertex_credentials, project_id=vertex_project
)
openai_chat_completions = OpenAIChatCompletion()
## Load Config
# config = litellm.VertexAILlama3.get_config()
# for k, v in config.items():
# if k not in optional_params:
# optional_params[k] = v
## CONSTRUCT API BASE
stream: bool = optional_params.get("stream", False) or False
optional_params["stream"] = stream
api_base = self.create_vertex_llama3_url(
vertex_location=vertex_location or "us-central1",
vertex_project=vertex_project or project_id,
)
return openai_chat_completions.completion(
model=model,
messages=messages,
api_base=api_base,
api_key=access_token,
custom_prompt_dict=custom_prompt_dict,
model_response=model_response,
print_verbose=print_verbose,
logging_obj=logging_obj,
optional_params=optional_params,
acompletion=acompletion,
litellm_params=litellm_params,
logger_fn=logger_fn,
client=client,
timeout=timeout,
)
except Exception as e:
raise VertexAIError(status_code=500, message=str(e))
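A hedged sketch of calling the handler above through `litellm.completion` - the `meta/` prefix is what triggers the new branch added to `main.py` later in this diff, and the model name matches the `vertex_ai/meta/llama3-405b-instruct-maas` pricing entry (project and location values are placeholders):

```python
import litellm

# assumes Google application-default credentials are available
response = litellm.completion(
    model="vertex_ai/meta/llama3-405b-instruct-maas",
    messages=[{"role": "user", "content": "Hello"}],
    vertex_project="my-gcp-project",   # placeholder project id
    vertex_location="us-central1",
)
```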

View file

@ -1033,7 +1033,7 @@ class VertexLLM(BaseLLM):
model=model, custom_llm_provider=_custom_llm_provider
)
except Exception as e:
verbose_logger.error(
verbose_logger.warning(
"Unable to identify if system message supported. Defaulting to 'False'. Received error message - {}\nAdd it here - https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json".format(
str(e)
)
@ -1189,7 +1189,7 @@ class VertexLLM(BaseLLM):
response.raise_for_status()
except httpx.HTTPStatusError as err:
error_code = err.response.status_code
raise VertexAIError(status_code=error_code, message=response.text)
raise VertexAIError(status_code=error_code, message=err.response.text)
except httpx.TimeoutException:
raise VertexAIError(status_code=408, message="Timeout error occurred.")

View file

@ -120,6 +120,7 @@ from .llms.prompt_templates.factory import (
)
from .llms.text_completion_codestral import CodestralTextCompletion
from .llms.triton import TritonChatCompletion
from .llms.vertex_ai_llama import VertexAILlama3
from .llms.vertex_httpx import VertexLLM
from .llms.watsonx import IBMWatsonXAI
from .types.llms.openai import HttpxBinaryResponseContent
@ -156,6 +157,7 @@ triton_chat_completions = TritonChatCompletion()
bedrock_chat_completion = BedrockLLM()
bedrock_converse_chat_completion = BedrockConverseLLM()
vertex_chat_completion = VertexLLM()
vertex_llama_chat_completion = VertexAILlama3()
watsonxai = IBMWatsonXAI()
####### COMPLETION ENDPOINTS ################
@ -375,6 +377,7 @@ async def acompletion(
or custom_llm_provider == "predibase"
or custom_llm_provider == "bedrock"
or custom_llm_provider == "databricks"
or custom_llm_provider == "triton"
or custom_llm_provider == "clarifai"
or custom_llm_provider == "watsonx"
or custom_llm_provider in litellm.openai_compatible_providers
@ -1491,6 +1494,10 @@ def completion(
or get_secret("ANTHROPIC_BASE_URL")
or "https://api.anthropic.com/v1/complete"
)
if api_base is not None and not api_base.endswith("/v1/complete"):
api_base += "/v1/complete"
response = anthropic_text_completions.completion(
model=model,
messages=messages,
@ -1517,6 +1524,10 @@ def completion(
or get_secret("ANTHROPIC_BASE_URL")
or "https://api.anthropic.com/v1/messages"
)
if api_base is not None and not api_base.endswith("/v1/messages"):
api_base += "/v1/messages"
response = anthropic_chat_completions.completion(
model=model,
messages=messages,
@ -2055,7 +2066,26 @@ def completion(
timeout=timeout,
client=client,
)
elif model.startswith("meta/"):
model_response = vertex_llama_chat_completion.completion(
model=model,
messages=messages,
model_response=model_response,
print_verbose=print_verbose,
optional_params=new_params,
litellm_params=litellm_params,
logger_fn=logger_fn,
encoding=encoding,
vertex_location=vertex_ai_location,
vertex_project=vertex_ai_project,
vertex_credentials=vertex_credentials,
logging_obj=logging,
acompletion=acompletion,
headers=headers,
custom_prompt_dict=custom_prompt_dict,
timeout=timeout,
client=client,
)
else:
model_response = vertex_ai.completion(
model=model,
@ -2469,6 +2499,25 @@ def completion(
return generator
response = generator
elif custom_llm_provider == "triton":
api_base = litellm.api_base or api_base
model_response = triton_chat_completions.completion(
api_base=api_base,
timeout=timeout, # type: ignore
model=model,
messages=messages,
model_response=model_response,
optional_params=optional_params,
logging_obj=logging,
stream=stream,
acompletion=acompletion,
)
## RESPONSE OBJECT
response = model_response
return response
elif custom_llm_provider == "cloudflare":
api_key = (
api_key
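Sketch of exercising the new `triton` branch added above: `api_base` is passed straight through to the Triton handler shown earlier in this diff, which dispatches on whether the URL ends in `generate` (TRT-LLM) or `infer` (custom model). The deployment name and URL below are placeholders:

```python
import litellm

response = litellm.completion(
    model="triton/llama-3-8b",  # placeholder deployed model name
    api_base="http://localhost:8000/v2/models/llama-3-8b/generate",
    messages=[{"role": "user", "content": "Hello"}],
)
```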

View file

@ -760,6 +760,36 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.00000061,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000268,
"output_cost_per_token": 0.00000354,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000533,
"output_cost_per_token": 0.000016,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -1948,6 +1978,16 @@
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
"max_output_tokens": 32000,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"litellm_provider": "vertex_ai-llama_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/imagegeneration@006": {
"cost_per_image": 0.020,
"litellm_provider": "vertex_ai-image-models",
@ -3633,6 +3673,24 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-8b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-70b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77,
"max_input_tokens": 77,

View file

@ -1,5 +1,8 @@
model_list:
- model_name: groq-llama3
- model_name: "*" # all requests where model not in your config go to this deployment
litellm_params:
model: groq/llama3-groq-70b-8192-tool-use-preview
api_key: os.environ/GROQ_API_KEY
model: "openai/*" # passes our validation check that a real provider is given
api_key: ""
general_settings:
completion_model: "gpt-3.5-turbo"

View file

@ -228,6 +228,10 @@ class LiteLLMRoutes(enum.Enum):
"/utils/token_counter",
]
anthropic_routes: List = [
"/v1/messages",
]
info_routes: List = [
"/key/info",
"/team/info",
@ -880,6 +884,26 @@ class BlockTeamRequest(LiteLLMBase):
team_id: str # required
class AddTeamCallback(LiteLLMBase):
callback_name: str
callback_type: Literal["success", "failure", "success_and_failure"]
# for now - only supported for langfuse
callback_vars: Dict[
Literal["langfuse_public_key", "langfuse_secret_key", "langfuse_host"], str
]
class TeamCallbackMetadata(LiteLLMBase):
success_callback: Optional[List[str]] = []
failure_callback: Optional[List[str]] = []
# for now - only supported for langfuse
callback_vars: Optional[
Dict[
Literal["langfuse_public_key", "langfuse_secret_key", "langfuse_host"], str
]
] = {}
class LiteLLM_TeamTable(TeamBase):
spend: Optional[float] = None
max_parallel_requests: Optional[int] = None
@ -1232,6 +1256,7 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
soft_budget: Optional[float] = None
team_model_aliases: Optional[Dict] = None
team_member_spend: Optional[float] = None
team_metadata: Optional[Dict] = None
# End User Params
end_user_id: Optional[str] = None
@ -1677,3 +1702,5 @@ class ProxyErrorTypes(str, enum.Enum):
budget_exceeded = "budget_exceeded"
expired_key = "expired_key"
auth_error = "auth_error"
internal_server_error = "internal_server_error"
bad_request_error = "bad_request_error"

View file

@ -24,7 +24,7 @@ from litellm.proxy._types import (
LitellmUserRoles,
UserAPIKeyAuth,
)
from litellm.proxy.auth.auth_utils import is_openai_route
from litellm.proxy.auth.auth_utils import is_llm_api_route
from litellm.proxy.utils import PrismaClient, ProxyLogging, log_to_opentelemetry
from litellm.types.services import ServiceLoggerPayload, ServiceTypes
@ -57,6 +57,7 @@ def common_checks(
4. If end_user (either via JWT or 'user' passed to /chat/completions, /embeddings endpoint) is in budget
5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
7. [OPTIONAL] If the request modifies guardrails - is the team allowed to change them
"""
_model = request_body.get("model", None)
if team_object is not None and team_object.blocked is True:
@ -106,7 +107,7 @@ def common_checks(
general_settings.get("enforce_user_param", None) is not None
and general_settings["enforce_user_param"] == True
):
if is_openai_route(route=route) and "user" not in request_body:
if is_llm_api_route(route=route) and "user" not in request_body:
raise Exception(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
@ -122,7 +123,7 @@ def common_checks(
+ CommonProxyErrors.not_premium_user.value
)
if is_openai_route(route=route):
if is_llm_api_route(route=route):
# loop through each enforced param
# example enforced_params ['user', 'metadata', 'metadata.generation_name']
for enforced_param in general_settings["enforced_params"]:
@ -150,7 +151,7 @@ def common_checks(
and global_proxy_spend is not None
# only run global budget checks for OpenAI routes
# Reason - the Admin UI should continue working if the proxy crosses it's global budget
and is_openai_route(route=route)
and is_llm_api_route(route=route)
and route != "/v1/models"
and route != "/models"
):
@ -158,6 +159,22 @@ def common_checks(
raise litellm.BudgetExceededError(
current_cost=global_proxy_spend, max_budget=litellm.max_budget
)
_request_metadata: dict = request_body.get("metadata", {}) or {}
if _request_metadata.get("guardrails"):
# check if team allowed to modify guardrails
from litellm.proxy.guardrails.guardrail_helpers import can_modify_guardrails
can_modify: bool = can_modify_guardrails(team_object)
if can_modify is False:
from fastapi import HTTPException
raise HTTPException(
status_code=403,
detail={
"error": "Your team does not have permission to modify guardrails."
},
)
return True

View file

@ -46,7 +46,7 @@ def route_in_additonal_public_routes(current_route: str):
return False
def is_openai_route(route: str) -> bool:
def is_llm_api_route(route: str) -> bool:
"""
Helper to check if the provided route is an LLM API route (OpenAI-compatible or Anthropic)
@ -59,6 +59,9 @@ def is_openai_route(route: str) -> bool:
if route in LiteLLMRoutes.openai_routes.value:
return True
if route in LiteLLMRoutes.anthropic_routes.value:
return True
# fuzzy match routes like "/v1/threads/thread_49EIN5QF32s4mH20M7GFKdlZ"
# Check for routes with placeholders
for openai_route in LiteLLMRoutes.openai_routes.value:

View file

@ -57,7 +57,7 @@ from litellm.proxy.auth.auth_checks import (
log_to_opentelemetry,
)
from litellm.proxy.auth.auth_utils import (
is_openai_route,
is_llm_api_route,
route_in_additonal_public_routes,
)
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
@ -924,6 +924,7 @@ async def user_api_key_auth(
rpm_limit=valid_token.team_rpm_limit,
blocked=valid_token.team_blocked,
models=valid_token.team_models,
metadata=valid_token.team_metadata,
)
user_api_key_cache.set_cache(
@ -994,9 +995,9 @@ async def user_api_key_auth(
_user_role = _get_user_role(user_id_information=user_id_information)
if not _is_user_proxy_admin(user_id_information): # if non-admin
if is_openai_route(route=route):
if is_llm_api_route(route=route):
pass
elif is_openai_route(route=request["route"].name):
elif is_llm_api_route(route=request["route"].name):
pass
elif (
route in LiteLLMRoutes.info_routes.value
@ -1049,7 +1050,7 @@ async def user_api_key_auth(
pass
elif _user_role == LitellmUserRoles.PROXY_ADMIN_VIEW_ONLY.value:
if is_openai_route(route=route):
if is_llm_api_route(route=route):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail=f"user not allowed to access this OpenAI routes, role= {_user_role}",

View file

@ -23,11 +23,11 @@ def initialize_callbacks_on_proxy(
)
if isinstance(value, list):
imported_list: List[Any] = []
known_compatible_callbacks = list(
get_args(litellm._custom_logger_compatible_callbacks_literal)
)
for callback in value: # ["presidio", <my-custom-callback>]
if isinstance(callback, str) and callback in known_compatible_callbacks:
if (
isinstance(callback, str)
and callback in litellm._known_custom_logger_compatible_callbacks
):
imported_list.append(callback)
elif isinstance(callback, str) and callback == "otel":
from litellm.integrations.opentelemetry import OpenTelemetry

View file

@ -1,9 +1,26 @@
from typing import Dict
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy.proxy_server import UserAPIKeyAuth
from litellm.proxy.proxy_server import LiteLLM_TeamTable, UserAPIKeyAuth
from litellm.types.guardrails import *
def can_modify_guardrails(team_obj: Optional[LiteLLM_TeamTable]) -> bool:
if team_obj is None:
return True
team_metadata = team_obj.metadata or {}
if team_metadata.get("guardrails", None) is not None and isinstance(
team_metadata.get("guardrails"), Dict
):
if team_metadata.get("guardrails", {}).get("modify_guardrails", None) is False:
return False
return True
async def should_proceed_based_on_metadata(data: dict, guardrail_name: str) -> bool:
"""
checks if this guardrail should be applied to this call

View file

@ -4,7 +4,7 @@ from typing import TYPE_CHECKING, Any, Dict, Optional
from fastapi import Request
from litellm._logging import verbose_logger, verbose_proxy_logger
from litellm.proxy._types import CommonProxyErrors, UserAPIKeyAuth
from litellm.proxy._types import CommonProxyErrors, TeamCallbackMetadata, UserAPIKeyAuth
from litellm.types.utils import SupportedCacheControls
if TYPE_CHECKING:
@ -39,6 +39,9 @@ def _get_metadata_variable_name(request: Request) -> str:
"""
if "thread" in request.url.path or "assistant" in request.url.path:
return "litellm_metadata"
if "/v1/messages" in request.url.path:
# anthropic API has a field called metadata
return "litellm_metadata"
else:
return "metadata"
@ -207,6 +210,32 @@ async def add_litellm_data_to_request(
**data,
} # add the team-specific configs to the completion call
# Team Callbacks controls
if user_api_key_dict.team_metadata is not None:
team_metadata = user_api_key_dict.team_metadata
if "callback_settings" in team_metadata:
callback_settings = team_metadata.get("callback_settings", None) or {}
callback_settings_obj = TeamCallbackMetadata(**callback_settings)
verbose_proxy_logger.debug(
"Team callback settings activated: %s", callback_settings_obj
)
"""
callback_settings = {
'callback_vars': {'langfuse_public_key': 'pk', 'langfuse_secret_key': 'sk_'},
'failure_callback': [],
'success_callback': ['langfuse', 'langfuse']
}
"""
data["success_callback"] = callback_settings_obj.success_callback
data["failure_callback"] = callback_settings_obj.failure_callback
if callback_settings_obj.callback_vars is not None:
# unpack callback_vars in data
for k, v in callback_settings_obj.callback_vars.items():
data[k] = v
return data

View file

@ -333,6 +333,13 @@ async def update_key_fn(
expires = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
non_default_values["expires"] = expires
if "budget_duration" in non_default_values:
duration_s = _duration_in_seconds(
duration=non_default_values["budget_duration"]
)
key_reset_at = datetime.now(timezone.utc) + timedelta(seconds=duration_s)
non_default_values["budget_reset_at"] = key_reset_at
response = await prisma_client.update_data(
token=key, data={**non_default_values, "token": key}
)
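A hedged example of the behaviour this hunk adds - updating a key's `budget_duration` now also recomputes `budget_reset_at` (the proxy URL, key values and duration below are illustrative):

```python
import requests

requests.post(
    "http://localhost:4000/key/update",
    headers={"Authorization": "Bearer sk-1234"},
    json={"key": "sk-...", "budget_duration": "30d"},  # budget_reset_at is pushed 30 days out
)
```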

View file

@ -0,0 +1,364 @@
"""
Endpoints to control callbacks per team
Use this when each team should control its own callbacks
"""
import asyncio
import copy
import json
import traceback
import uuid
from datetime import datetime, timedelta, timezone
from typing import List, Optional
import fastapi
from fastapi import APIRouter, Depends, Header, HTTPException, Request, status
import litellm
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import (
AddTeamCallback,
LiteLLM_TeamTable,
ProxyErrorTypes,
ProxyException,
TeamCallbackMetadata,
UserAPIKeyAuth,
)
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.management_helpers.utils import (
add_new_member,
management_endpoint_wrapper,
)
router = APIRouter()
@router.post(
"/team/{team_id:path}/callback",
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def add_team_callbacks(
data: AddTeamCallback,
http_request: Request,
team_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
litellm_changed_by: Optional[str] = Header(
None,
description="The litellm-changed-by header enables tracking of actions performed by authorized users on behalf of other users, providing an audit trail for accountability",
),
):
"""
Add a success/failure callback to a team
Use this if you want different teams to have different success/failure callbacks
Example curl:
```
curl -X POST 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"callback_name": "langfuse",
"callback_type": "success",
"callback_vars": {"langfuse_public_key": "pk-lf-xxxx1", "langfuse_secret_key": "sk-xxxxx"}
}'
```
This means for the team where team_id = dbe2f686-a686-4896-864a-4c3924458709, all LLM calls will be logged to langfuse using the public key pk-lf-xxxx1 and the secret key sk-xxxxx
"""
try:
from litellm.proxy.proxy_server import (
_duration_in_seconds,
create_audit_log_for_update,
litellm_proxy_admin_name,
prisma_client,
)
if prisma_client is None:
raise HTTPException(status_code=500, detail={"error": "No db connected"})
# Check if team_id exists already
_existing_team = await prisma_client.get_data(
team_id=team_id, table_name="team", query_type="find_unique"
)
if _existing_team is None:
raise HTTPException(
status_code=400,
detail={
"error": f"Team id = {team_id} does not exist. Please use a different team id."
},
)
# store team callback settings in metadata
team_metadata = _existing_team.metadata
team_callback_settings = team_metadata.get("callback_settings", {})
# callback settings are expected to follow the TeamCallbackMetadata schema
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
if data.callback_type == "success":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if data.callback_name in team_callback_settings_obj.success_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.success_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
team_callback_settings_obj.success_callback.append(data.callback_name)
elif data.callback_type == "failure":
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name in team_callback_settings_obj.failure_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.failure_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
team_callback_settings_obj.failure_callback.append(data.callback_name)
elif data.callback_type == "success_and_failure":
if team_callback_settings_obj.success_callback is None:
team_callback_settings_obj.success_callback = []
if team_callback_settings_obj.failure_callback is None:
team_callback_settings_obj.failure_callback = []
if data.callback_name in team_callback_settings_obj.success_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in success_callback, for team_id = {team_id}. \n Existing success_callback = {team_callback_settings_obj.success_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
if data.callback_name in team_callback_settings_obj.failure_callback:
raise ProxyException(
message=f"callback_name = {data.callback_name} already exists in failure_callback, for team_id = {team_id}. \n Existing failure_callback = {team_callback_settings_obj.failure_callback}",
code=status.HTTP_400_BAD_REQUEST,
type=ProxyErrorTypes.bad_request_error,
param="callback_name",
)
team_callback_settings_obj.success_callback.append(data.callback_name)
team_callback_settings_obj.failure_callback.append(data.callback_name)
for var, value in data.callback_vars.items():
if team_callback_settings_obj.callback_vars is None:
team_callback_settings_obj.callback_vars = {}
team_callback_settings_obj.callback_vars[var] = value
team_callback_settings_obj_dict = team_callback_settings_obj.model_dump()
team_metadata["callback_settings"] = team_callback_settings_obj_dict
team_metadata_json = json.dumps(team_metadata) # update team_metadata
new_team_row = await prisma_client.db.litellm_teamtable.update(
where={"team_id": team_id}, data={"metadata": team_metadata_json} # type: ignore
)
return {
"status": "success",
"data": new_team_row,
}
except Exception as e:
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.add_team_callbacks(): Exception occured - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
raise ProxyException(
message="Internal Server Error, " + str(e),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
@router.post(
"/team/{team_id}/disable_logging",
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def disable_team_logging(
http_request: Request,
team_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
try:
from litellm.proxy.proxy_server import prisma_client
if prisma_client is None:
raise HTTPException(status_code=500, detail={"error": "No db connected"})
# Check if team exists
_existing_team = await prisma_client.get_data(
team_id=team_id, table_name="team", query_type="find_unique"
)
if _existing_team is None:
raise HTTPException(
status_code=404,
detail={"error": f"Team id = {team_id} does not exist."},
)
# Update team metadata to disable logging
team_metadata = _existing_team.metadata
team_callback_settings = team_metadata.get("callback_settings", {})
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
# Reset callbacks
team_callback_settings_obj.success_callback = []
team_callback_settings_obj.failure_callback = []
# Update metadata
team_metadata["callback_settings"] = team_callback_settings_obj.model_dump()
team_metadata_json = json.dumps(team_metadata)
# Update team in database
updated_team = await prisma_client.db.litellm_teamtable.update(
where={"team_id": team_id}, data={"metadata": team_metadata_json} # type: ignore
)
if updated_team is None:
raise HTTPException(
status_code=404,
detail={
"error": f"Team id = {team_id} does not exist. Error updating team logging"
},
)
return {
"status": "success",
"message": f"Logging disabled for team {team_id}",
"data": {
"team_id": updated_team.team_id,
"success_callbacks": [],
"failure_callbacks": [],
},
}
except Exception as e:
verbose_proxy_logger.error(
f"litellm.proxy.proxy_server.disable_team_logging(): Exception occurred - {str(e)}"
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
raise ProxyException(
message="Internal Server Error, " + str(e),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)
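The endpoints above and below carry curl examples in their docstrings; for the new `disable_logging` route, a comparable sketch (proxy URL, team id and admin key are illustrative):

```python
import requests

resp = requests.post(
    "http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/disable_logging",
    headers={"Authorization": "Bearer sk-1234"},
)
print(resp.json())  # {"status": "success", "message": "Logging disabled for team ...", "data": {...}}
```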
@router.get(
"/team/{team_id:path}/callback",
tags=["team management"],
dependencies=[Depends(user_api_key_auth)],
)
@management_endpoint_wrapper
async def get_team_callbacks(
http_request: Request,
team_id: str,
user_api_key_dict: UserAPIKeyAuth = Depends(user_api_key_auth),
):
"""
Get the success/failure callbacks and variables for a team
Example curl:
```
curl -X GET 'http://localhost:4000/team/dbe2f686-a686-4896-864a-4c3924458709/callback' \
-H 'Authorization: Bearer sk-1234'
```
This will return the callback settings for the team with id dbe2f686-a686-4896-864a-4c3924458709
Returns {
"status": "success",
"data": {
"team_id": team_id,
"success_callbacks": team_callback_settings_obj.success_callback,
"failure_callbacks": team_callback_settings_obj.failure_callback,
"callback_vars": team_callback_settings_obj.callback_vars,
},
}
"""
try:
from litellm.proxy.proxy_server import prisma_client
if prisma_client is None:
raise HTTPException(status_code=500, detail={"error": "No db connected"})
# Check if team_id exists
_existing_team = await prisma_client.get_data(
team_id=team_id, table_name="team", query_type="find_unique"
)
if _existing_team is None:
raise HTTPException(
status_code=404,
detail={"error": f"Team id = {team_id} does not exist."},
)
# Retrieve team callback settings from metadata
team_metadata = _existing_team.metadata
team_callback_settings = team_metadata.get("callback_settings", {})
# Convert to TeamCallbackMetadata object for consistent structure
team_callback_settings_obj = TeamCallbackMetadata(**team_callback_settings)
return {
"status": "success",
"data": {
"team_id": team_id,
"success_callbacks": team_callback_settings_obj.success_callback,
"failure_callbacks": team_callback_settings_obj.failure_callback,
"callback_vars": team_callback_settings_obj.callback_vars,
},
}
except Exception as e:
verbose_proxy_logger.error(
"litellm.proxy.proxy_server.get_team_callbacks(): Exception occurred - {}".format(
str(e)
)
)
verbose_proxy_logger.debug(traceback.format_exc())
if isinstance(e, HTTPException):
raise ProxyException(
message=getattr(e, "detail", f"Internal Server Error({str(e)})"),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=getattr(e, "status_code", status.HTTP_500_INTERNAL_SERVER_ERROR),
)
elif isinstance(e, ProxyException):
raise e
raise ProxyException(
message="Internal Server Error, " + str(e),
type=ProxyErrorTypes.internal_server_error.value,
param=getattr(e, "param", "None"),
code=status.HTTP_500_INTERNAL_SERVER_ERROR,
)

View file

@ -363,6 +363,7 @@ async def update_team(
# set the budget_reset_at in DB
updated_kv["budget_reset_at"] = reset_at
updated_kv = prisma_client.jsonify_object(data=updated_kv)
team_row: Optional[
LiteLLM_TeamTable
] = await prisma_client.db.litellm_teamtable.update(

View file

@ -1,10 +1,21 @@
model_list:
- model_name: gpt-4
litellm_params:
model: openai/fake
api_key: fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
- model_name: fireworks-llama-v3-70b-instruct
litellm_params:
model: fireworks_ai/accounts/fireworks/models/llama-v3-70b-instruct
api_key: "os.environ/FIREWORKS_AI_API_KEY"
router_settings:
enable_tag_filtering: True # 👈 Key Change
api_key: "os.environ/FIREWORKS"
general_settings:
master_key: sk-1234
alerting: ["slack"]
alerting_threshold: 0.0001
alert_to_webhook_url: {
"llm_too_slow": "https://hooks.slack.com/services/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",
"llm_requests_hanging": "https://hooks.slack.com/services/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",
}
litellm_settings:
success_callback: ["langfuse"]

View file

@ -170,6 +170,9 @@ from litellm.proxy.management_endpoints.key_management_endpoints import (
from litellm.proxy.management_endpoints.key_management_endpoints import (
router as key_management_router,
)
from litellm.proxy.management_endpoints.team_callback_endpoints import (
router as team_callback_router,
)
from litellm.proxy.management_endpoints.team_endpoints import router as team_router
from litellm.proxy.openai_files_endpoints.files_endpoints import (
router as openai_files_router,
@ -654,7 +657,11 @@ async def _PROXY_track_cost_callback(
global prisma_client, custom_db_client
try:
# check if it has collected an entire stream response
verbose_proxy_logger.debug("Proxy: In track_cost_callback for: %s", kwargs)
verbose_proxy_logger.debug(
"Proxy: In track_cost_callback for: kwargs=%s and completion_response: %s",
kwargs,
completion_response,
)
verbose_proxy_logger.debug(
f"kwargs stream: {kwargs.get('stream', None)} + complete streaming response: {kwargs.get('complete_streaming_response', None)}"
)
@ -1620,6 +1627,7 @@ class ProxyConfig:
alerting=general_settings.get("alerting", None),
alerting_threshold=general_settings.get("alerting_threshold", 600),
alert_types=general_settings.get("alert_types", None),
alert_to_webhook_url=general_settings.get("alert_to_webhook_url", None),
alerting_args=general_settings.get("alerting_args", None),
redis_cache=redis_usage_cache,
)
@ -2905,6 +2913,7 @@ async def chat_completion(
fastest_response_batch_completion = hidden_params.get(
"fastest_response_batch_completion", None
)
additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
# Post Call Processing
if llm_router is not None:
@ -2927,6 +2936,7 @@ async def chat_completion(
response_cost=response_cost,
model_region=getattr(user_api_key_dict, "allowed_model_region", ""),
fastest_response_batch_completion=fastest_response_batch_completion,
**additional_headers,
)
selected_data_generator = select_data_generator(
response=response,
@ -2944,8 +2954,10 @@ async def chat_completion(
user_api_key_dict=user_api_key_dict, response=response
)
hidden_params = getattr(response, "_hidden_params", {}) or {}
additional_headers: dict = hidden_params.get("additional_headers", {}) or {}
hidden_params = (
getattr(response, "_hidden_params", {}) or {}
) # get any updated response headers
additional_headers = hidden_params.get("additional_headers", {}) or {}
fastapi_response.headers.update(
get_custom_headers(
@ -9457,3 +9469,4 @@ app.include_router(analytics_router)
app.include_router(debugging_endpoints_router)
app.include_router(ui_crud_endpoints_router)
app.include_router(openai_files_router)
app.include_router(team_callback_router)

View file

@ -183,12 +183,12 @@ model LiteLLM_SpendLogs {
model String @default("")
model_id String? @default("") // the model id stored in proxy model db
model_group String? @default("") // public model_name / model_group
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
request_tags Json @default("[]")
api_base String? @default("")
user String? @default("")
metadata Json? @default("{}")
cache_hit String? @default("")
cache_key String? @default("")
request_tags Json? @default("[]")
team_id String?
end_user String?
requester_ip_address String?

View file

@ -0,0 +1,22 @@
import os
from anthropic import Anthropic
client = Anthropic(
# This is the default and can be omitted
base_url="http://localhost:4000",
# this is a litellm proxy key :) - not a real anthropic key
api_key="sk-s4xN1IiLTCytwtZFJaYQrA",
)
message = client.messages.create(
max_tokens=1024,
messages=[
{
"role": "user",
"content": "Hello, Claude",
}
],
model="claude-3-opus-20240229",
)
print(message.content)
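The same proxy base_url should also work with the Anthropic SDK's streaming interface; a minimal sketch, reusing the assumed local proxy and key from the example above:

```python
# Hedged sketch: stream the same request through the LiteLLM proxy, using the
# Anthropic SDK's messages.stream() context manager.
from anthropic import Anthropic

client = Anthropic(
    base_url="http://localhost:4000",
    api_key="sk-s4xN1IiLTCytwtZFJaYQrA",  # litellm proxy key, not a real anthropic key
)
with client.messages.stream(
    max_tokens=1024,
    messages=[{"role": "user", "content": "Hello, Claude"}],
    model="claude-3-opus-20240229",
) as stream:
    for text in stream.text_stream:
        print(text, end="", flush=True)
```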

View file

@ -25,7 +25,7 @@ from typing_extensions import overload
import litellm
import litellm.litellm_core_utils
import litellm.litellm_core_utils.litellm_logging
from litellm import EmbeddingResponse, ImageResponse, ModelResponse
from litellm import EmbeddingResponse, ImageResponse, ModelResponse, get_litellm_params
from litellm._logging import verbose_proxy_logger
from litellm._service_logger import ServiceLogging, ServiceTypes
from litellm.caching import DualCache, RedisCache
@ -50,7 +50,7 @@ from litellm.proxy.hooks.max_budget_limiter import _PROXY_MaxBudgetLimiter
from litellm.proxy.hooks.parallel_request_limiter import (
_PROXY_MaxParallelRequestsHandler,
)
from litellm.types.utils import CallTypes
from litellm.types.utils import CallTypes, LoggedLiteLLMParams
if TYPE_CHECKING:
from opentelemetry.trace import Span as _Span
@ -188,6 +188,7 @@ class ProxyLogging:
"new_model_added",
"outage_alerts",
]
self.alert_to_webhook_url: Optional[dict] = None
self.slack_alerting_instance: SlackAlerting = SlackAlerting(
alerting_threshold=self.alerting_threshold,
alerting=self.alerting,
@ -202,6 +203,7 @@ class ProxyLogging:
redis_cache: Optional[RedisCache] = None,
alert_types: Optional[List[AlertType]] = None,
alerting_args: Optional[dict] = None,
alert_to_webhook_url: Optional[dict] = None,
):
updated_slack_alerting: bool = False
if alerting is not None:
@ -213,6 +215,9 @@ class ProxyLogging:
if alert_types is not None:
self.alert_types = alert_types
updated_slack_alerting = True
if alert_to_webhook_url is not None:
self.alert_to_webhook_url = alert_to_webhook_url
updated_slack_alerting = True
if updated_slack_alerting is True:
self.slack_alerting_instance.update_values(
@ -220,6 +225,7 @@ class ProxyLogging:
alerting_threshold=self.alerting_threshold,
alert_types=self.alert_types,
alerting_args=alerting_args,
alert_to_webhook_url=self.alert_to_webhook_url,
)
if (
@ -602,14 +608,20 @@ class ProxyLogging:
if litellm_logging_obj is not None:
## UPDATE LOGGING INPUT
_optional_params = {}
_litellm_params = {}
litellm_param_keys = LoggedLiteLLMParams.__annotations__.keys()
for k, v in request_data.items():
if k != "model" and k != "user" and k != "litellm_params":
if k in litellm_param_keys:
_litellm_params[k] = v
elif k != "model" and k != "user":
_optional_params[k] = v
litellm_logging_obj.update_environment_variables(
model=request_data.get("model", ""),
user=request_data.get("user", ""),
optional_params=_optional_params,
litellm_params=request_data.get("litellm_params", {}),
litellm_params=_litellm_params,
)
input: Union[list, str, dict] = ""
@ -832,6 +844,30 @@ class PrismaClient:
If the view doesn't exist, one will be created.
"""
# Check to see if all of the necessary views exist and if they do, simply return
# This is more efficient because it lets us check for all views in one
# query instead of multiple queries.
try:
ret = await self.db.query_raw(
"""
SELECT SUM(1) FROM pg_views
WHERE schemaname = 'public' AND viewname IN (
'LiteLLM_VerificationTokenView',
'MonthlyGlobalSpend',
'Last30dKeysBySpend',
'Last30dModelsBySpend',
'MonthlyGlobalSpendPerKey',
'Last30dTopEndUsersSpend'
)
"""
)
if ret[0]['sum'] == 6:
print("All necessary views exist!") # noqa
return
except Exception:
pass
try:
# Try to select one row from the view
await self.db.query_raw(
@ -1313,8 +1349,10 @@ class PrismaClient:
t.tpm_limit AS team_tpm_limit,
t.rpm_limit AS team_rpm_limit,
t.models AS team_models,
t.metadata AS team_metadata,
t.blocked AS team_blocked,
t.team_alias AS team_alias,
t.metadata AS team_metadata,
tm.spend AS team_member_spend,
m.aliases as team_model_aliases
FROM "LiteLLM_VerificationToken" AS v

View file

@ -895,6 +895,52 @@ async def test_gemini_pro_function_calling_httpx(model, sync_mode):
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
from litellm.tests.test_completion import response_format_tests
@pytest.mark.parametrize(
"model", ["vertex_ai/meta/llama3-405b-instruct-maas"]
) # "vertex_ai",
@pytest.mark.parametrize("sync_mode", [True, False]) # "vertex_ai",
@pytest.mark.asyncio
async def test_llama_3_httpx(model, sync_mode):
try:
load_vertex_ai_credentials()
litellm.set_verbose = True
messages = [
{
"role": "system",
"content": "Your name is Litellm Bot, you are a helpful assistant",
},
# User asks for their name and weather in San Francisco
{
"role": "user",
"content": "Hello, what is your name and can you tell me the weather?",
},
]
data = {
"model": model,
"messages": messages,
}
if sync_mode:
response = litellm.completion(**data)
else:
response = await litellm.acompletion(**data)
response_format_tests(response=response)
print(f"response: {response}")
except litellm.RateLimitError as e:
pass
except Exception as e:
if "429 Quota exceeded" in str(e):
pass
else:
pytest.fail("An unexpected exception occurred - {}".format(str(e)))
def vertex_httpx_mock_reject_prompt_post(*args, **kwargs):
mock_response = MagicMock()
mock_response.status_code = 200

View file

@ -48,6 +48,42 @@ def test_anthropic_completion_input_translation():
]
def test_anthropic_completion_input_translation_with_metadata():
"""
Tests that cost tracking works as expected with LiteLLM Proxy
LiteLLM Proxy will insert litellm_metadata for anthropic endpoints to track user_api_key and user_api_key_team_id
This test ensures that the `litellm_metadata` is not present in the translated input
It ensures that `litellm.acompletion()` will receive metadata, which is a litellm-specific param
"""
data = {
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hey, how's it going?"}],
"litellm_metadata": {
"user_api_key": "88dc28d0f030c55ed4ab77ed8faf098196cb1c05df778539800c9f1243fe6b4b",
"user_api_key_alias": None,
"user_api_end_user_max_budget": None,
"litellm_api_version": "1.40.19",
"global_max_parallel_requests": None,
"user_api_key_user_id": "default_user_id",
"user_api_key_org_id": None,
"user_api_key_team_id": None,
"user_api_key_team_alias": None,
"user_api_key_team_max_budget": None,
"user_api_key_team_spend": None,
"user_api_key_spend": 0.0,
"user_api_key_max_budget": None,
"user_api_key_metadata": {},
},
}
translated_input = anthropic_adapter.translate_completion_input_params(kwargs=data)
assert "litellm_metadata" not in translated_input
assert "metadata" in translated_input
assert translated_input["metadata"] == data["litellm_metadata"]
def test_anthropic_completion_e2e():
litellm.set_verbose = True

View file

@ -0,0 +1,29 @@
import asyncio
import logging
import os
import time
import pytest
from dotenv import load_dotenv
from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
import litellm
from litellm._logging import verbose_logger
from litellm.integrations.opentelemetry import OpenTelemetry, OpenTelemetryConfig
load_dotenv()
import logging
@pytest.mark.asyncio()
async def test_async_otel_callback():
litellm.set_verbose = True
litellm.success_callback = ["arize"]
await litellm.acompletion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "hi test from local arize"}],
mock_response="hello",
temperature=0.1,
user="OTEL_USER",
)

View file

@ -2,18 +2,19 @@
# This tests chaos-monkey scenarios - what happens if random parts of the system are broken or things aren't sent correctly.
# Expect to add more edge cases to this over time.
import sys, os
import os
import sys
import traceback
import pytest
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm import embedding, completion
from litellm import completion, embedding
from litellm.utils import Message
# litellm.set_verbose = True
user_message = "Hello, how are you?"
messages = [{"content": user_message, "role": "user"}]
@ -74,6 +75,8 @@ def test_completion_invalid_param_cohere():
response = completion(model="command-nightly", messages=messages, seed=12)
pytest.fail(f"This should have failed cohere does not support `seed` parameter")
except Exception as e:
assert isinstance(e, litellm.UnsupportedParamsError)
print("got an exception=", str(e))
if " cohere does not support parameters: {'seed': 12}" in str(e):
pass
else:

View file

@ -0,0 +1,53 @@
# What is this?
## This tests the braintrust integration
import asyncio
import os
import random
import sys
import time
import traceback
from datetime import datetime
from dotenv import load_dotenv
from fastapi import Request
load_dotenv()
import os
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import asyncio
import logging
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import litellm
from litellm.llms.custom_httpx.http_handler import HTTPHandler
def test_braintrust_logging():
import litellm
http_client = HTTPHandler()
setattr(
litellm.integrations.braintrust_logging,
"global_braintrust_sync_http_handler",
http_client,
)
with patch.object(http_client, "post", new=MagicMock()) as mock_client:
# set braintrust as a callback, litellm will send the data to braintrust
litellm.callbacks = ["braintrust"]
# openai call
response = litellm.completion(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hi 👋 - i'm openai"}],
)
mock_client.assert_called()
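Outside the mocked test, a hedged sketch of the same callback logging a real completion; the `BRAINTRUST_API_KEY` env var name is an assumption about what the integration reads:

```python
# Hedged sketch: log a real completion to Braintrust (no mocking).
# Assumes BRAINTRUST_API_KEY is the env var the braintrust integration reads.
import os
import litellm

os.environ["BRAINTRUST_API_KEY"] = ""  # set your key here
litellm.callbacks = ["braintrust"]

response = litellm.completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello from the braintrust logging sketch"}],
)
print(response.choices[0].message.content)
```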

View file

@ -346,7 +346,7 @@ def test_completion_claude_3_empty_response():
messages = [
{
"role": "system",
"content": "You are 2twNLGfqk4GMOn3ffp4p.",
"content": [{"type": "text", "text": "You are 2twNLGfqk4GMOn3ffp4p."}],
},
{"role": "user", "content": "Hi gm!", "name": "ishaan"},
{"role": "assistant", "content": "Good morning! How are you doing today?"},
@ -1364,6 +1364,12 @@ def test_completion_openai_response_headers():
print("response_headers=", response._response_headers)
assert response._response_headers is not None
assert "x-ratelimit-remaining-tokens" in response._response_headers
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
# /chat/completion - with streaming
@ -1376,6 +1382,12 @@ def test_completion_openai_response_headers():
print("streaming response_headers=", response_headers)
assert response_headers is not None
assert "x-ratelimit-remaining-tokens" in response_headers
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
for chunk in streaming_response:
print("chunk=", chunk)
@ -1390,6 +1402,12 @@ def test_completion_openai_response_headers():
print("embedding_response_headers=", embedding_response_headers)
assert embedding_response_headers is not None
assert "x-ratelimit-remaining-tokens" in embedding_response_headers
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
litellm.return_response_headers = False
@ -2542,6 +2560,71 @@ def test_completion_anyscale_with_functions():
# test_completion_anyscale_with_functions()
def test_completion_azure_extra_headers():
# this tests if we can pass extra_headers (e.g. Ocp-Apim-Subscription-Key) to completion, and that they are sent on the outgoing request.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
from httpx import Client
from openai import AzureOpenAI
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
http_client = Client()
with patch.object(http_client, "send", new=MagicMock()) as mock_client:
litellm.client_session = http_client
try:
response = completion(
model="azure/chatgpt-v-2",
messages=messages,
api_base=os.getenv("AZURE_API_BASE"),
api_version="2023-07-01-preview",
api_key=os.getenv("AZURE_API_KEY"),
extra_headers={
"Authorization": "my-bad-key",
"Ocp-Apim-Subscription-Key": "hello-world-testing",
},
)
print(response)
pytest.fail("Expected this to fail")
except Exception as e:
pass
mock_client.assert_called()
print(f"mock_client.call_args: {mock_client.call_args}")
request = mock_client.call_args[0][0]
print(request.method) # This will print 'POST'
print(request.url) # This will print the full URL
print(request.headers) # This will print the request headers
auth_header = request.headers.get("Authorization")
apim_key = request.headers.get("Ocp-Apim-Subscription-Key")
print(auth_header)
assert auth_header == "my-bad-key"
assert apim_key == "hello-world-testing"
def test_completion_azure_ad_token():
# this tests if we can pass azure_ad_token to completion, when the api_key is not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
from httpx import Client
from openai import AzureOpenAI
from litellm import completion
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
response = completion(
model="azure/chatgpt-v-2",
messages=messages,
# api_key="my-fake-ad-token",
azure_ad_token=os.getenv("AZURE_API_KEY"),
)
print(response)
def test_completion_azure_key_completion_arg():
# this tests if we can pass api_key to completion, when it's not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!

View file

@ -881,6 +881,7 @@ def test_completion_azure_ai():
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_completion_cost_hidden_params(sync_mode):
litellm.return_response_headers = True
if sync_mode:
response = litellm.completion(
model="gpt-3.5-turbo",

View file

@ -235,6 +235,7 @@ class CompletionCustomHandler(
assert isinstance(kwargs["optional_params"], dict)
assert isinstance(kwargs["litellm_params"], dict)
assert isinstance(kwargs["litellm_params"]["metadata"], Optional[dict])
assert isinstance(kwargs["start_time"], (datetime, type(None)))
assert isinstance(kwargs["stream"], bool)
assert isinstance(kwargs["user"], (str, type(None)))

View file

@ -197,6 +197,29 @@ def test_openai_azure_embedding():
pytest.fail(f"Error occurred: {e}")
@pytest.mark.skipif(
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
reason="Cannot run without being in CircleCI Runner",
)
def test_openai_azure_embedding_with_oidc_and_cf():
# TODO: Switch to our own Azure account, currently using ai.moda's account
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
try:
response = embedding(
model="azure/text-embedding-ada-002",
input=["Hello"],
azure_ad_token="oidc/circleci/",
api_base="https://eastus2-litellm.openai.azure.com/",
api_version="2024-06-01",
)
print(response)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
def test_openai_azure_embedding_optional_arg(mocker):
mocked_create_embeddings = mocker.patch.object(
openai.resources.embeddings.Embeddings,
@ -650,3 +673,17 @@ async def test_databricks_embeddings(sync_mode):
# print(response)
# local_proxy_embeddings()
def test_embedding_azure_ad_token():
# this tests if we can pass azure_ad_token to embedding, when the api_key is not in the env.
# DO NOT REMOVE THIS TEST. No MATTER WHAT Happens!
# If you want to remove it, speak to Ishaan!
# Ishaan will be very disappointed if this test is removed -> this is a standard way to pass api_key + the router + proxy use this
response = embedding(
model="azure/azure-embedding-model",
input=["good morning from litellm"],
azure_ad_token=os.getenv("AZURE_API_KEY"),
)
print(response)

View file

@ -64,6 +64,30 @@ async def test_content_policy_exception_azure():
pytest.fail(f"An exception occurred - {str(e)}")
@pytest.mark.asyncio
async def test_content_policy_exception_openai():
try:
# this is only a test - we needed some way to invoke the exception :(
litellm.set_verbose = True
response = await litellm.acompletion(
model="gpt-3.5-turbo-0613",
stream=True,
messages=[
{"role": "user", "content": "Gimme the lyrics to Don't Stop Me Now"}
],
)
async for chunk in response:
print(chunk)
except litellm.ContentPolicyViolationError as e:
print("caught a content policy violation error! Passed")
print("exception", e)
assert e.llm_provider == "openai"
pass
except Exception as e:
print()
pytest.fail(f"An exception occurred - {str(e)}")
# Test 1: Context Window Errors
@pytest.mark.skip(reason="AWS Suspended Account")
@pytest.mark.parametrize("model", exception_models)

View file

@ -36,6 +36,7 @@ async def test_async_langsmith_logging():
temperature=0.2,
metadata={
"id": run_id,
"tags": ["tag1", "tag2"],
"user_api_key": "6eb81e014497d89f3cc1aa9da7c2b37bda6b7fea68e4b710d33d94201e68970c",
"user_api_key_alias": "ishaans-langmsith-key",
"user_api_end_user_max_budget": None,

View file

@ -128,6 +128,19 @@ def test_azure_ai_mistral_optional_params():
assert "user" not in optional_params
def test_vertex_ai_llama_3_optional_params():
litellm.vertex_llama3_models = ["meta/llama3-405b-instruct-maas"]
litellm.drop_params = True
optional_params = get_optional_params(
model="meta/llama3-405b-instruct-maas",
user="John",
custom_llm_provider="vertex_ai",
max_tokens=10,
temperature=0.2,
)
assert "user" not in optional_params
def test_azure_gpt_optional_params_gpt_vision():
# for OpenAI, Azure all extra params need to get passed as extra_body to OpenAI python. We assert we actually set extra_body here
optional_params = litellm.utils.get_optional_params(

View file

@ -212,7 +212,7 @@ def test_convert_url_to_img():
[
("data:image/jpeg;base64,1234", "image/jpeg"),
("data:application/pdf;base64,1234", "application/pdf"),
("data:image\/jpeg;base64,1234", "image/jpeg"),
(r"data:image\/jpeg;base64,1234", "image/jpeg"),
],
)
def test_base64_image_input(url, expected_media_type):

View file

@ -19,7 +19,7 @@ import pytest
import litellm
from litellm.proxy._types import LiteLLMRoutes
from litellm.proxy.auth.auth_utils import is_openai_route
from litellm.proxy.auth.auth_utils import is_llm_api_route
from litellm.proxy.proxy_server import app
# Configure logging
@ -77,8 +77,8 @@ def test_routes_on_litellm_proxy():
("/v1/non_existent_endpoint", False),
],
)
def test_is_openai_route(route: str, expected: bool):
assert is_openai_route(route) == expected
def test_is_llm_api_route(route: str, expected: bool):
assert is_llm_api_route(route) == expected
# Test case for routes that are similar but should return False
@ -91,5 +91,10 @@ def test_is_openai_route(route: str, expected: bool):
"/engines/model/invalid/completions",
],
)
def test_is_openai_route_similar_but_false(route: str):
assert is_openai_route(route) == False
def test_is_llm_api_route_similar_but_false(route: str):
assert is_llm_api_route(route) == False
def test_anthropic_api_routes():
# allow non proxy admins to call anthropic api routes
assert is_llm_api_route(route="/v1/messages") is True

View file

@ -173,6 +173,63 @@ def test_chat_completion(mock_acompletion, client_no_auth):
pytest.fail(f"LiteLLM Proxy test failed. Exception - {str(e)}")
@mock_patch_acompletion()
@pytest.mark.asyncio
async def test_team_disable_guardrails(mock_acompletion, client_no_auth):
"""
If a team is not allowed to turn guardrails on/off,
raise a 403 Forbidden error when the team makes a request on `/key/generate` or `/chat/completions`.
"""
import asyncio
import json
import time
from fastapi import HTTPException, Request
from starlette.datastructures import URL
from litellm.proxy._types import LiteLLM_TeamTable, ProxyException, UserAPIKeyAuth
from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
from litellm.proxy.proxy_server import hash_token, user_api_key_cache
_team_id = "1234"
user_key = "sk-12345678"
valid_token = UserAPIKeyAuth(
team_id=_team_id,
team_blocked=True,
token=hash_token(user_key),
last_refreshed_at=time.time(),
)
await asyncio.sleep(1)
team_obj = LiteLLM_TeamTable(
team_id=_team_id,
blocked=False,
last_refreshed_at=time.time(),
metadata={"guardrails": {"modify_guardrails": False}},
)
user_api_key_cache.set_cache(key=hash_token(user_key), value=valid_token)
user_api_key_cache.set_cache(key="team_id:{}".format(_team_id), value=team_obj)
setattr(litellm.proxy.proxy_server, "user_api_key_cache", user_api_key_cache)
setattr(litellm.proxy.proxy_server, "master_key", "sk-1234")
setattr(litellm.proxy.proxy_server, "prisma_client", "hello-world")
request = Request(scope={"type": "http"})
request._url = URL(url="/chat/completions")
body = {"metadata": {"guardrails": {"hide_secrets": False}}}
json_bytes = json.dumps(body).encode("utf-8")
request._body = json_bytes
try:
await user_api_key_auth(request=request, api_key="Bearer " + user_key)
pytest.fail("Expected to raise 403 forbidden error.")
except ProxyException as e:
assert e.code == 403
from litellm.tests.test_custom_callback_input import CompletionCustomHandler

View file

@ -12,6 +12,8 @@ sys.path.insert(
import pytest
from litellm import get_secret
from litellm.proxy.secret_managers.aws_secret_manager import load_aws_secret_manager
from litellm.llms.azure import get_azure_ad_token_from_oidc
from litellm.llms.bedrock_httpx import BedrockLLM
@pytest.mark.skip(reason="AWS Suspended Account")
@ -60,7 +62,7 @@ def test_oidc_github():
)
def test_oidc_circleci():
secret_val = get_secret(
"oidc/circleci/https://bedrock-runtime.us-east-1.amazonaws.com/model/amazon.titan-text-express-v1/invoke"
"oidc/circleci/"
)
print(f"secret_val: {redact_oidc_signature(secret_val)}")
@ -76,3 +78,38 @@ def test_oidc_circleci_v2():
)
print(f"secret_val: {redact_oidc_signature(secret_val)}")
@pytest.mark.skipif(
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
reason="Cannot run without being in CircleCI Runner",
)
def test_oidc_circleci_with_azure():
# TODO: Switch to our own Azure account, currently using ai.moda's account
os.environ["AZURE_TENANT_ID"] = "17c0a27a-1246-4aa1-a3b6-d294e80e783c"
os.environ["AZURE_CLIENT_ID"] = "4faf5422-b2bd-45e8-a6d7-46543a38acd0"
azure_ad_token = get_azure_ad_token_from_oidc("oidc/circleci/")
print(f"secret_val: {redact_oidc_signature(azure_ad_token)}")
@pytest.mark.skipif(
os.environ.get("CIRCLE_OIDC_TOKEN") is None,
reason="Cannot run without being in CircleCI Runner",
)
def test_oidc_circle_v1_with_amazon():
# The purpose of this test is to get logs using the older v1 of the CircleCI OIDC token
# TODO: This is using ai.moda's IAM role, we should use LiteLLM's IAM role eventually
aws_role_name = (
"arn:aws:iam::335785316107:role/litellm-github-unit-tests-circleci-v1-assume-only"
)
aws_web_identity_token = "oidc/circleci/"
bllm = BedrockLLM()
creds = bllm.get_credentials(
aws_region_name="ca-west-1",
aws_web_identity_token=aws_web_identity_token,
aws_role_name=aws_role_name,
aws_session_name="assume-v1-session",
)

View file

@ -1988,25 +1988,30 @@ async def test_hf_completion_tgi_stream():
# test on openai completion call
def test_openai_chat_completion_call():
try:
litellm.set_verbose = False
print(f"making openai chat completion call")
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
print(f"outside chunk: {chunk}")
if finished:
break
complete_response += chunk
# print(f'complete_chunk: {complete_response}')
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
except:
print(f"error occurred: {traceback.format_exc()}")
pass
litellm.set_verbose = False
litellm.return_response_headers = True
print(f"making openai chat completion call")
response = completion(model="gpt-3.5-turbo", messages=messages, stream=True)
assert isinstance(
response._hidden_params["additional_headers"][
"llm_provider-x-ratelimit-remaining-requests"
],
str,
)
print(f"response._hidden_params: {response._hidden_params}")
complete_response = ""
start_time = time.time()
for idx, chunk in enumerate(response):
chunk, finished = streaming_format_tests(idx, chunk)
print(f"outside chunk: {chunk}")
if finished:
break
complete_response += chunk
# print(f'complete_chunk: {complete_response}')
if complete_response.strip() == "":
raise Exception("Empty response received")
print(f"complete response: {complete_response}")
# test_openai_chat_completion_call()

View file

@ -1,4 +1,4 @@
from typing import Iterable, List, Optional, Union
from typing import Any, Dict, Iterable, List, Optional, Union
from pydantic import BaseModel, validator
from typing_extensions import Literal, Required, TypedDict
@ -113,6 +113,9 @@ class AnthropicMessagesRequest(TypedDict, total=False):
top_k: int
top_p: float
# litellm param - used for tracking litellm proxy metadata in the request
litellm_metadata: dict
class ContentTextBlockDelta(TypedDict):
"""

View file

@ -436,6 +436,7 @@ class ChatCompletionRequest(TypedDict, total=False):
function_call: Union[str, dict]
functions: List
user: str
metadata: dict # litellm specific param
class ChatCompletionDeltaChunk(TypedDict, total=False):

View file

@ -1029,3 +1029,22 @@ class GenericImageParsingChunk(TypedDict):
class ResponseFormatChunk(TypedDict, total=False):
type: Required[Literal["json_object", "text"]]
response_schema: dict
class LoggedLiteLLMParams(TypedDict, total=False):
force_timeout: Optional[float]
custom_llm_provider: Optional[str]
api_base: Optional[str]
litellm_call_id: Optional[str]
model_alias_map: Optional[dict]
metadata: Optional[dict]
model_info: Optional[dict]
proxy_server_request: Optional[dict]
acompletion: Optional[bool]
preset_cache_key: Optional[str]
no_log: Optional[bool]
input_cost_per_second: Optional[float]
input_cost_per_token: Optional[float]
output_cost_per_token: Optional[float]
output_cost_per_second: Optional[float]
cooldown_time: Optional[float]
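As a rough illustration of the pattern this TypedDict enables (and which the ProxyLogging change earlier in this diff relies on), here is a self-contained sketch that splits request kwargs by membership in `LoggedLiteLLMParams.__annotations__`; the sample request and trimmed field list are illustrative only:

```python
# Hedged sketch: split a kwargs dict into litellm params vs. other optional params
# by checking membership in the TypedDict's annotation keys.
from typing import Optional
from typing_extensions import TypedDict


class LoggedLiteLLMParams(TypedDict, total=False):  # trimmed field list for the sketch
    api_base: Optional[str]
    metadata: Optional[dict]
    cooldown_time: Optional[float]


request_data = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.2,
    "api_base": "https://example.invalid/v1",  # placeholder base url
    "metadata": {"team_id": "my-team"},
}

litellm_param_keys = LoggedLiteLLMParams.__annotations__.keys()
litellm_params = {k: v for k, v in request_data.items() if k in litellm_param_keys}
optional_params = {
    k: v
    for k, v in request_data.items()
    if k not in litellm_param_keys and k not in ("model", "user")
}
print(litellm_params)   # {'api_base': ..., 'metadata': {...}}
print(optional_params)  # {'temperature': 0.2}
```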

View file

@ -129,6 +129,7 @@ from .exceptions import (
ServiceUnavailableError,
Timeout,
UnprocessableEntityError,
UnsupportedParamsError,
)
from .proxy._types import KeyManagementSystem
from .types.llms.openai import (
@ -158,6 +159,7 @@ from typing import (
Tuple,
Union,
cast,
get_args,
)
from .caching import Cache
@ -224,17 +226,6 @@ last_fetched_at_keys = None
# }
class UnsupportedParamsError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(method="POST", url=" https://openai.api.com/v1/")
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
############################################################
def print_verbose(
print_statement,
@ -405,7 +396,6 @@ def function_setup(
# Pop the async items from input_callback in reverse order to avoid index issues
for index in reversed(removed_async_items):
litellm.input_callback.pop(index)
if len(litellm.success_callback) > 0:
removed_async_items = []
for index, callback in enumerate(litellm.success_callback): # type: ignore
@ -417,9 +407,9 @@ def function_setup(
# we only support async dynamo db logging for acompletion/aembedding since that's used on proxy
litellm._async_success_callback.append(callback)
removed_async_items.append(index)
elif callback == "langsmith":
elif callback in litellm._known_custom_logger_compatible_callbacks:
callback_class = litellm.litellm_core_utils.litellm_logging._init_custom_logger_compatible_class( # type: ignore
callback, internal_usage_cache=None, llm_router=None
callback, internal_usage_cache=None, llm_router=None # type: ignore
)
# don't double add a callback
@ -3088,6 +3078,15 @@ def get_optional_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
elif custom_llm_provider == "vertex_ai" and model in litellm.vertex_llama3_models:
supported_params = get_supported_openai_params(
model=model, custom_llm_provider=custom_llm_provider
)
_check_valid_arg(supported_params=supported_params)
optional_params = litellm.VertexAILlama3Config().map_openai_params(
non_default_params=non_default_params,
optional_params=optional_params,
)
elif custom_llm_provider == "sagemaker":
## check if unsupported param passed in
supported_params = get_supported_openai_params(
@ -4189,6 +4188,9 @@ def get_supported_openai_params(
return litellm.GoogleAIStudioGeminiConfig().get_supported_openai_params()
elif custom_llm_provider == "vertex_ai":
if request_type == "chat_completion":
if model.startswith("meta/"):
return litellm.VertexAILlama3Config().get_supported_openai_params()
return litellm.VertexAIConfig().get_supported_openai_params()
elif request_type == "embeddings":
return litellm.VertexAITextEmbeddingConfig().get_supported_openai_params()
@ -4484,7 +4486,11 @@ def get_llm_provider(
or get_secret("TOGETHER_AI_TOKEN")
)
elif custom_llm_provider == "friendliai":
api_base = "https://inference.friendli.ai/v1"
api_base = (
api_base
or get_secret("FRIENDLI_API_BASE")
or "https://inference.friendli.ai/v1"
)
dynamic_api_key = (
api_key
or get_secret("FRIENDLIAI_API_KEY")
@ -5678,6 +5684,14 @@ def convert_to_model_response_object(
_response_headers: Optional[dict] = None,
):
received_args = locals()
if _response_headers is not None:
llm_response_headers = {
"{}-{}".format("llm_provider", k): v for k, v in _response_headers.items()
}
if hidden_params is not None:
hidden_params["additional_headers"] = llm_response_headers
else:
hidden_params = {"additional_headers": llm_response_headers}
### CHECK IF ERROR IN RESPONSE ### - openrouter returns these in the dictionary
if (
response_object is not None
@ -5744,10 +5758,12 @@ def convert_to_model_response_object(
model_response_object.usage.total_tokens = response_object["usage"].get("total_tokens", 0) # type: ignore
if "created" in response_object:
model_response_object.created = response_object["created"]
model_response_object.created = response_object["created"] or int(
time.time()
)
if "id" in response_object:
model_response_object.id = response_object["id"]
model_response_object.id = response_object["id"] or str(uuid.uuid4())
if "system_fingerprint" in response_object:
model_response_object.system_fingerprint = response_object[
@ -8312,8 +8328,13 @@ class CustomStreamWrapper:
or {}
)
self._hidden_params = {
"model_id": (_model_info.get("id", None))
"model_id": (_model_info.get("id", None)),
} # returned as x-litellm-model-id response header in proxy
if _response_headers is not None:
self._hidden_params["additional_headers"] = {
"{}-{}".format("llm_provider", k): v
for k, v in _response_headers.items()
}
self._response_headers = _response_headers
self.response_id = None
self.logging_loop = None
@ -8808,11 +8829,14 @@ class CustomStreamWrapper:
str_line.choices[0].content_filter_result
)
else:
error_message = "Azure Response={}".format(
str(dict(str_line))
error_message = "{} Response={}".format(
self.custom_llm_provider, str(dict(str_line))
)
raise litellm.AzureOpenAIError(
status_code=400, message=error_message
raise litellm.ContentPolicyViolationError(
message=error_message,
llm_provider=self.custom_llm_provider,
model=self.model,
)
# checking for logprobs
@ -9094,6 +9118,42 @@ class CustomStreamWrapper:
except Exception as e:
raise e
def handle_triton_stream(self, chunk):
try:
if isinstance(chunk, dict):
parsed_response = chunk
elif isinstance(chunk, (str, bytes)):
if isinstance(chunk, bytes):
chunk = chunk.decode("utf-8")
if "text_output" in chunk:
response = chunk.replace("data: ", "").strip()
parsed_response = json.loads(response)
else:
return {
"text": "",
"is_finished": False,
"prompt_tokens": 0,
"completion_tokens": 0,
}
else:
print_verbose(f"chunk: {chunk} (Type: {type(chunk)})")
raise ValueError(
f"Unable to parse response. Original response: {chunk}"
)
text = parsed_response.get("text_output", "")
finish_reason = parsed_response.get("stop_reason")
is_finished = parsed_response.get("is_finished", False)
return {
"text": text,
"is_finished": is_finished,
"finish_reason": finish_reason,
"prompt_tokens": parsed_response.get("input_token_count", 0),
"completion_tokens": parsed_response.get("generated_token_count", 0),
}
return {"text": "", "is_finished": False}
except Exception as e:
raise e
def handle_clarifai_completion_chunk(self, chunk):
try:
if isinstance(chunk, dict):
@ -9513,6 +9573,12 @@ class CustomStreamWrapper:
completion_obj["content"] = response_obj["text"]
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "triton":
response_obj = self.handle_triton_stream(chunk)
completion_obj["content"] = response_obj["text"]
print_verbose(f"completion obj content: {completion_obj['content']}")
if response_obj["is_finished"]:
self.received_finish_reason = response_obj["finish_reason"]
elif self.custom_llm_provider == "text-completion-openai":
response_obj = self.handle_openai_text_completion_chunk(chunk)
completion_obj["content"] = response_obj["text"]
@ -10068,6 +10134,7 @@ class CustomStreamWrapper:
or self.custom_llm_provider == "predibase"
or self.custom_llm_provider == "databricks"
or self.custom_llm_provider == "bedrock"
or self.custom_llm_provider == "triton"
or self.custom_llm_provider == "watsonx"
or self.custom_llm_provider in litellm.openai_compatible_endpoints
):

View file

@ -760,6 +760,36 @@
"litellm_provider": "azure_ai",
"mode": "chat"
},
"azure_ai/Meta-Llama-31-8B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.00000061,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-8b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-70B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000268,
"output_cost_per_token": 0.00000354,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-70b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Meta-Llama-31-405B-Instruct": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000533,
"output_cost_per_token": 0.000016,
"litellm_provider": "azure_ai",
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -1948,6 +1978,16 @@
"supports_function_calling": true,
"supports_vision": true
},
"vertex_ai/meta/llama3-405b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
"max_output_tokens": 32000,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"litellm_provider": "vertex_ai-llama_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/imagegeneration@006": {
"cost_per_image": 0.020,
"litellm_provider": "vertex_ai-image-models",
@ -3633,6 +3673,24 @@
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-8b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.0000004,
"output_cost_per_token": 0.0000006,
"litellm_provider": "bedrock",
"mode": "chat"
},
"meta.llama3-1-70b-instruct-v1:0": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 2048,
"input_cost_per_token": 0.00000265,
"output_cost_per_token": 0.0000035,
"litellm_provider": "bedrock",
"mode": "chat"
},
"512-x-512/50-steps/stability.stable-diffusion-xl-v0": {
"max_tokens": 77,
"max_input_tokens": 77,

prometheus.yml Normal file
View file

@ -0,0 +1,7 @@
global:
scrape_interval: 15s
scrape_configs:
- job_name: 'litellm'
static_configs:
- targets: ['litellm:4000'] # Assuming Litellm exposes metrics at port 4000
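Assuming the proxy exposes Prometheus metrics at the target above (for example with a `prometheus` callback enabled in its config), you can sanity-check the scrape endpoint directly; the URL and port in this sketch are assumptions:

```python
# Hedged sketch: confirm the /metrics endpoint the scrape config points at is
# actually serving Prometheus text-format metrics.
import requests

resp = requests.get("http://localhost:4000/metrics", timeout=5)
resp.raise_for_status()
print(resp.text[:500])  # should show lines like "# HELP ..." / "# TYPE ..."
```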

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "litellm"
version = "1.41.26"
version = "1.42.0"
description = "Library to easily interface with LLM API providers"
authors = ["BerriAI"]
license = "MIT"
@ -91,7 +91,7 @@ requires = ["poetry-core", "wheel"]
build-backend = "poetry.core.masonry.api"
[tool.commitizen]
version = "1.41.26"
version = "1.42.0"
version_files = [
"pyproject.toml:^version"
]

View file

@ -172,7 +172,7 @@ model LiteLLM_Config {
model LiteLLM_SpendLogs {
request_id String @id
call_type String
api_key String @default ("")
api_key String @default ("") // Hashed API Token. Not the actual Virtual Key. Equivalent to 'token' column in LiteLLM_VerificationToken
spend Float @default(0.0)
total_tokens Int @default(0)
prompt_tokens Int @default(0)
@ -183,12 +183,12 @@ model LiteLLM_SpendLogs {
model String @default("")
model_id String? @default("") // the model id stored in proxy model db
model_group String? @default("") // public model_name / model_group
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
request_tags Json @default("[]")
api_base String? @default("")
user String? @default("")
metadata Json? @default("{}")
cache_hit String? @default("")
cache_key String? @default("")
request_tags Json? @default("[]")
team_id String?
end_user String?
requester_ip_address String?
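With these columns now nullable, old rows or writers that omit them can come back as `None`. A hedged sketch of a defensive read using the `prisma_client.db.<table>` access pattern seen elsewhere in this diff; the helper name is hypothetical:

```python
# Hedged sketch: treat the newly-nullable LiteLLM_SpendLogs columns as optional
# when reading them back, falling back to the old defaults.
async def read_spend_log(prisma_client, request_id: str) -> dict:
    row = await prisma_client.db.litellm_spendlogs.find_unique(
        where={"request_id": request_id}
    )
    if row is None:
        return {}
    return {
        "api_base": row.api_base or "",
        "user": row.user or "",
        "metadata": row.metadata or {},
        "cache_hit": row.cache_hit or "",
        "request_tags": row.request_tags or [],
    }
```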