Merge branch 'main' into main

This commit is contained in:
Vincelwt 2024-03-30 13:21:53 +09:00 committed by GitHub
commit 1b84dfac91
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
301 changed files with 62646 additions and 3691 deletions

View file

@ -28,8 +28,9 @@ jobs:
pip install "pytest==7.3.1"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install "google-generativeai>=0.3.2"
pip install "google-cloud-aiplatform>=1.38.0"
pip install "google-generativeai==0.3.2"
pip install "google-cloud-aiplatform==1.43.0"
pip install pyarrow
pip install "boto3>=1.28.57"
pip install "aioboto3>=12.3.0"
pip install langchain
@ -48,6 +49,7 @@ jobs:
pip install argon2-cffi
pip install "pytest-mock==3.12.0"
pip install python-multipart
pip install google-cloud-aiplatform
- save_cache:
paths:
- ./venv
@ -152,10 +154,11 @@ jobs:
pip install "pytest-mock==3.12.0"
pip install "pytest-asyncio==0.21.1"
pip install mypy
pip install "google-generativeai>=0.3.2"
pip install "google-cloud-aiplatform>=1.38.0"
pip install "boto3>=1.28.57"
pip install "aioboto3>=12.3.0"
pip install "google-generativeai==0.3.2"
pip install "google-cloud-aiplatform==1.43.0"
pip install pyarrow
pip install "boto3==1.34.34"
pip install "aioboto3==12.3.0"
pip install langchain
pip install "langfuse>=2.0.0"
pip install numpydoc

View file

@ -7,8 +7,7 @@ baseten
cohere
redis
anthropic
boto3
orjson
pydantic
google-cloud-aiplatform
google-cloud-aiplatform==1.43.0
redisvl==0.0.7 # semantic caching

View file

@ -43,6 +43,13 @@ jobs:
push: true
file: Dockerfile.database
tags: litellm/litellm-database:${{ github.event.inputs.tag || 'latest' }}
-
name: Build and push litellm-spend-logs image
uses: docker/build-push-action@v5
with:
push: true
file: ./litellm-js/spend-logs/Dockerfile
tags: litellm/litellm-spend_logs:${{ github.event.inputs.tag || 'latest' }}
build-and-push-image:
runs-on: ubuntu-latest
@ -120,6 +127,44 @@ jobs:
tags: ${{ steps.meta-database.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-database.outputs.tags }}-latest
labels: ${{ steps.meta-database.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-image-spend-logs:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract metadata (tags, labels) for spend-logs Dockerfile
id: meta-spend-logs
uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}-spend_logs
# Configure multi platform Docker builds
- name: Set up QEMU
uses: docker/setup-qemu-action@e0e4588fad221d38ee467c0bffd91115366dc0c5
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@edfb0fe6204400c56fbfd3feba3fe9ad1adfa345
- name: Build and push Database Docker image
uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
with:
context: .
file: ./litellm-js/spend-logs/Dockerfile
push: true
tags: ${{ steps.meta-spend-logs.outputs.tags }}-${{ github.event.inputs.tag || 'latest' }}, ${{ steps.meta-spend-logs.outputs.tags }}-latest
labels: ${{ steps.meta-spend-logs.outputs.labels }}
platforms: local,linux/amd64,linux/arm64,linux/arm64/v8
build-and-push-helm-chart:
runs-on: ubuntu-latest
steps:

View file

@ -1,8 +1,8 @@
# Base image for building
ARG LITELLM_BUILD_IMAGE=python:3.9
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
# Runtime image
ARG LITELLM_RUNTIME_IMAGE=python:3.9-slim
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
# Builder stage
FROM $LITELLM_BUILD_IMAGE as builder
@ -70,5 +70,5 @@ EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--detailed_debug"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml", "--run_gunicorn", "--num_workers", "4"]
# CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]

View file

@ -1,8 +1,8 @@
# Base image for building
ARG LITELLM_BUILD_IMAGE=python:3.9
ARG LITELLM_BUILD_IMAGE=python:3.11.8-slim
# Runtime image
ARG LITELLM_RUNTIME_IMAGE=python:3.9-slim
ARG LITELLM_RUNTIME_IMAGE=python:3.11.8-slim
# Builder stage
FROM $LITELLM_BUILD_IMAGE as builder
@ -72,5 +72,5 @@ EXPOSE 4000/tcp
ENTRYPOINT ["litellm"]
# Append "--detailed_debug" to the end of CMD to view detailed debug logs
# CMD ["--port", "4000","--run_gunicorn", "--detailed_debug"]
CMD ["--port", "4000", "--run_gunicorn"]
# CMD ["--port", "4000", "--detailed_debug"]
CMD ["--port", "4000"]

View file

@ -31,11 +31,11 @@ LiteLLM manages:
- Retry/fallback logic across multiple deployments (e.g. Azure/OpenAI) - [Router](https://docs.litellm.ai/docs/routing)
- Set Budgets & Rate limits per project, api key, model [OpenAI Proxy Server](https://docs.litellm.ai/docs/simple_proxy)
**Stable Release**: v`1.30.2` 👈 Recommended stable version of proxy.
[**Jump to OpenAI Proxy Docs**](https://github.com/BerriAI/litellm?tab=readme-ov-file#openai-proxy---docs) <br>
[**Jump to Supported LLM Providers**](https://github.com/BerriAI/litellm?tab=readme-ov-file#supported-provider-docs)
🚨 **Stable Release:** v1.34.1
Support for more providers. Missing a provider or LLM Platform, raise a [feature request](https://github.com/BerriAI/litellm/issues/new?assignees=&labels=enhancement&projects=&template=feature_request.yml&title=%5BFeature%5D%3A+).
# Usage ([**Docs**](https://docs.litellm.ai/docs/))

View file

@ -0,0 +1,55 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 3
selector:
matchLabels:
app: litellm
template:
metadata:
labels:
app: litellm
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
env:
- name: AZURE_API_KEY
value: "d6f****"
- name: AZURE_API_BASE
value: "https://openai
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: "postgresql://ishaan:*********""
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file
volumeMounts: # Define volume mount for proxy_config.yaml
- name: config-volume
mountPath: /app
readOnly: true
livenessProbe:
httpGet:
path: /health/liveliness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /health/readiness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
volumes: # Define volume to mount proxy_config.yaml
- name: config-volume
configMap:
name: litellm-config

View file

@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: litellm-service
spec:
selector:
app: litellm
ports:
- protocol: TCP
port: 4000
targetPort: 4000
type: LoadBalancer

View file

@ -76,7 +76,6 @@ Click on your personal dashboard link. Here's how you can find it 👇
Oh! Looks like our request was made successfully. Let's click on it and see exactly what got sent to the LLM provider.
<Image img={require('../../img/dashboard_log_row.png')} alt="Dashboard Log Row" />

View file

@ -41,6 +41,35 @@ response = completion(
)
```
## Additional information in metadata
You can send some additional information to Athina by using the `metadata` field in completion. This can be useful for sending metadata about the request, such as the customer_id, prompt_slug, or any other information you want to track.
```python
#openai call with additional metadata
response = completion(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": "Hi 👋 - i'm openai"}
],
metadata={
"environment": "staging",
"prompt_slug": "my_prompt_slug/v1"
}
)
```
Following are the allowed fields in metadata, their types, and their descriptions:
* `environment: Optional[str]` - Environment your app is running in (ex: production, staging, etc). This is useful for segmenting inference calls by environment.
* `prompt_slug: Optional[str]` - Identifier for the prompt used for inference. This is useful for segmenting inference calls by prompt.
* `customer_id: Optional[str]` - This is your customer ID. This is useful for segmenting inference calls by customer.
* `customer_user_id: Optional[str]` - This is the end user ID. This is useful for segmenting inference calls by the end user.
* `session_id: Optional[str]` - is the session or conversation ID. This is used for grouping different inferences into a conversation or chain. [Read more].(https://docs.athina.ai/logging/grouping_inferences)
* `external_reference_id: Optional[str]` - This is useful if you want to associate your own internal identifier with the inference logged to Athina.
* `context: Optional[Union[dict, str]]` - This is the context used as information for the prompt. For RAG applications, this is the "retrieved" data. You may log context as a string or as an object (dictionary).
* `expected_response: Optional[str]` - This is the reference response to compare against for evaluation purposes. This is useful for segmenting inference calls by expected response.
* `user_query: Optional[str]` - This is the user's query. For conversational applications, this is the user's last message.
## Support & Talk with Athina Team
- [Schedule Demo 👋](https://cal.com/shiv-athina/30min)

View file

@ -60,11 +60,30 @@ export ANTHROPIC_API_KEY="your-api-key"
### 2. Start the proxy
<Tabs>
<TabItem value="cli" label="cli">
```bash
$ litellm --model claude-3-opus-20240229
# Server running on http://0.0.0.0:4000
```
</TabItem>
<TabItem value="config" label="config.yaml">
```yaml
model_list:
- model_name: claude-3 ### RECEIVED MODEL NAME ###
litellm_params: # all params accepted by litellm.completion() - https://docs.litellm.ai/docs/completion/input
model: claude-3-opus-20240229 ### MODEL NAME sent to `litellm.completion()` ###
api_key: "os.environ/ANTHROPIC_API_KEY" # does os.getenv("AZURE_API_KEY_EU")
```
```bash
litellm --config /path/to/config.yaml
```
</TabItem>
</Tabs>
### 3. Test it
@ -76,7 +95,7 @@ $ litellm --model claude-3-opus-20240229
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data ' {
"model": "gpt-3.5-turbo",
"model": "claude-3",
"messages": [
{
"role": "user",
@ -97,7 +116,7 @@ client = openai.OpenAI(
)
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(model="gpt-3.5-turbo", messages = [
response = client.chat.completions.create(model="claude-3", messages = [
{
"role": "user",
"content": "this is a test request, write a short poem"
@ -121,7 +140,7 @@ from langchain.schema import HumanMessage, SystemMessage
chat = ChatOpenAI(
openai_api_base="http://0.0.0.0:4000", # set openai_api_base to the LiteLLM Proxy
model = "gpt-3.5-turbo",
model = "claude-3",
temperature=0.1
)
@ -238,7 +257,7 @@ resp = litellm.completion(
print(f"\nResponse: {resp}")
```
### Usage - "Assistant Pre-fill"
## Usage - "Assistant Pre-fill"
You can "put words in Claude's mouth" by including an `assistant` role message as the last item in the `messages` array.
@ -271,8 +290,8 @@ Human: How do you say 'Hello' in German? Return your answer as a JSON object, li
Assistant: {
```
### Usage - "System" messages
If you're using Anthropic's Claude 2.1 with Bedrock, `system` role messages are properly formatted for you.
## Usage - "System" messages
If you're using Anthropic's Claude 2.1, `system` role messages are properly formatted for you.
```python
import os

View file

@ -20,7 +20,28 @@ os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
model="sagemaker/<your-endpoint-name>",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.2,
max_tokens=80
)
```
### Passing Inference Component Name
If you have multiple models on an endpoint, you'll need to specify the individual model names, do this via `model_id`.
```python
import os
from litellm import completion
os.environ["AWS_ACCESS_KEY_ID"] = ""
os.environ["AWS_SECRET_ACCESS_KEY"] = ""
os.environ["AWS_REGION_NAME"] = ""
response = completion(
model="sagemaker/<your-endpoint-name>",
model_id="<your-model-name",
messages=[{ "content": "Hello, how are you?","role": "user"}],
temperature=0.2,
max_tokens=80

View file

@ -2,6 +2,7 @@
## Pre-requisites
* `pip install -q google-generativeai`
* Get API Key - https://aistudio.google.com/
# Gemini-Pro
## Sample Usage
@ -97,6 +98,6 @@ print(content)
| Model Name | Function Call | Required OS Variables |
|------------------|--------------------------------------|-------------------------|
| gemini-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro | `completion('gemini/gemini-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro | `completion('gemini/gemini-1.5-pro', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-vision | `completion('gemini/gemini-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |
| gemini-1.5-pro-vision | `completion('gemini/gemini-1.5-pro-vision', messages)` | `os.environ['GEMINI_API_KEY']` |

View file

@ -1,61 +0,0 @@
import Image from '@theme/IdealImage';
# 🚨 Budget Alerting
**Alerts when a project will exceed its planned limit**
<Image img={require('../../img/budget_alerts.png')} />
## Quick Start
### 1. Setup Slack Alerting on your Proxy Config.yaml
**Add Slack Webhook to your env**
Get a slack webhook url from https://api.slack.com/messaging/webhooks
Set `SLACK_WEBHOOK_URL` in your proxy env
```shell
export SLACK_WEBHOOK_URL="https://hooks.slack.com/services/<>/<>/<>"
```
**Update proxy config.yaml with slack alerting**
Add `general_settings:alerting`
```yaml
model_list:
model_name: "azure-model"
litellm_params:
model: "azure/gpt-35-turbo"
general_settings:
alerting: ["slack"]
```
Start proxy
```bash
$ litellm --config /path/to/config.yaml
```
### 2. Create API Key on Proxy Admin UI
The Admin UI is found on `your-litellm-proxy-endpoint/ui`, example `http://localhost:4000/ui/`
- Set a key name
- Set a Soft Budget on when to get alerted
<Image img={require('../../img/create_key.png')} />
### 3. Test Slack Alerting on Admin UI
After creating a key on the Admin UI, click on "Test Slack Alert" to send a test alert to your Slack channel
<Image img={require('../../img/test_alert.png')} />
### 4. Check Slack
When the test alert works, you should expect to see this on your alerts slack channel
<Image img={require('../../img/budget_alerts.png')} />

View file

@ -32,8 +32,9 @@ litellm_settings:
cache: True # set cache responses to True, litellm defaults to using a redis cache
```
#### [OPTIONAL] Step 1.5: Add redis namespaces
#### [OPTIONAL] Step 1.5: Add redis namespaces, default ttl
## Namespace
If you want to create some folder for your keys, you can set a namespace, like this:
```yaml
@ -50,6 +51,16 @@ and keys will be stored like:
litellm_caching:<hash>
```
## TTL
```yaml
litellm_settings:
cache: true
cache_params: # set cache params for redis
type: redis
ttl: 600 # will be cached on redis for 600s
```
#### Step 2: Add Redis Credentials to .env
Set either `REDIS_URL` or the `REDIS_HOST` in your os environment, to enable caching.

View file

@ -1,7 +1,10 @@
import Image from '@theme/IdealImage';
# Modify / Reject Incoming Requests
- Modify data before making llm api calls on proxy
- Reject data before making llm api calls / before returning the response
- Enforce 'user' param for all openai endpoint calls
See a complete example with our [parallel request rate limiter](https://github.com/BerriAI/litellm/blob/main/litellm/proxy/hooks/parallel_request_limiter.py)
@ -95,7 +98,7 @@ We might need to update the function schema in the future, to support multiple e
:::
See a complete example with our [Llama Guard content moderation hook](https://github.com/BerriAI/litellm/blob/main/enterprise/hooks/llama_guard.py)
See a complete example with our [Llama Guard content moderation hook](https://github.com/BerriAI/litellm/blob/main/enterprise/enterprise_hooks/llm_guard.py)
```python
from litellm.integrations.custom_logger import CustomLogger
@ -172,4 +175,19 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}
],
}'
```
```
## Advanced - Enforce 'user' param
Set `enforce_user_param` to true, to require all calls to the openai endpoints to have the 'user' param.
[**See Code**](https://github.com/BerriAI/litellm/blob/4777921a31c4c70e4d87b927cb233b6a09cd8b51/litellm/proxy/auth/auth_checks.py#L72)
```yaml
general_settings:
enforce_user_param: True
```
**Result**
<Image img={require('../../img/end_user_enforcement.png')}/>

View file

@ -62,7 +62,6 @@ model_list:
litellm_settings: # module level litellm settings - https://github.com/BerriAI/litellm/blob/main/litellm/__init__.py
drop_params: True
set_verbose: True
general_settings:
master_key: sk-1234 # [OPTIONAL] Only use this if you to require all calls to contain this key (Authorization: Bearer sk-1234)
@ -558,6 +557,16 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \
}'
```
## Disable Swagger UI
To disable the Swagger docs from the base url, set
```env
NO_DOCS="True"
```
in your environment, and restart the proxy.
## Configure DB Pool Limits + Connection Timeouts
@ -592,7 +601,9 @@ general_settings:
"completion_model": "string",
"disable_spend_logs": "boolean", # turn off writing each transaction to the db
"disable_reset_budget": "boolean", # turn off reset budget scheduled task
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
"enable_jwt_auth": "boolean", # allow proxy admin to auth in via jwt tokens with 'litellm_proxy_admin' in claims
"enforce_user_param": "boolean", # requires all openai endpoint requests to have a 'user' param
"allowed_routes": "list", # list of allowed proxy API routes - a user can access. (currently JWT-Auth only)
"key_management_system": "google_kms", # either google_kms or azure_kms
"master_key": "string",
"database_url": "string",

View file

@ -103,7 +103,10 @@ RUN chmod +x entrypoint.sh
EXPOSE 4000/tcp
# Override the CMD instruction with your desired command and arguments
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug", "--run_gunicorn"]
# WARNING: FOR PROD DO NOT USE `--detailed_debug` it slows down response times, instead use the following CMD
# CMD ["--port", "4000", "--config", "config.yaml"]
CMD ["--port", "4000", "--config", "config.yaml", "--detailed_debug"]
```
</TabItem>
@ -232,7 +235,6 @@ Your OpenAI proxy server is now running on `http://127.0.0.1:4000`.
| [LiteLLM container + Redis](#litellm-container--redis) | + load balance across multiple litellm containers |
| [LiteLLM Database container + PostgresDB + Redis](#litellm-database-container--postgresdb--redis) | + use Virtual Keys + Track Spend + load balance across multiple litellm containers |
## Deploy with Database
### Docker, Kubernetes, Helm Chart
@ -474,25 +476,6 @@ docker run --name litellm-proxy \
ghcr.io/berriai/litellm-database:main-latest --config your_config.yaml
```
## Best Practices for Deploying to Production
### 1. Switch of debug logs in production
don't use [`--detailed-debug`, `--debug`](https://docs.litellm.ai/docs/proxy/debugging#detailed-debug) or `litellm.set_verbose=True`. We found using debug logs can add 5-10% latency per LLM API call
### 2. Use `run_gunicorn` and `num_workers`
Example setting `--run_gunicorn` and `--num_workers`
```shell
docker run ghcr.io/berriai/litellm-database:main-latest --run_gunicorn --num_workers 4
```
Why `Gunicorn`?
- Gunicorn takes care of running multiple instances of your web application
- Gunicorn is ideal for running litellm proxy on cluster of machines with Kubernetes
Why `num_workers`?
Setting `num_workers` to the number of CPUs available ensures optimal utilization of system resources by matching the number of worker processes to the available CPU cores.
## Advanced Deployment Settings
### Customization of the server root path
@ -525,6 +508,57 @@ Provide an ssl certificate when starting litellm proxy server
## Platform-specific Guide
<Tabs>
<TabItem value="AWS EKS" label="AWS EKS - Kubernetes">
### Kubernetes - Deploy on EKS
Step1. Create an EKS Cluster with the following spec
```shell
eksctl create cluster --name=litellm-cluster --region=us-west-2 --node-type=t2.small
```
Step 2. Mount litellm proxy config on kub cluster
This will mount your local file called `proxy_config.yaml` on kubernetes cluster
```shell
kubectl create configmap litellm-config --from-file=proxy_config.yaml
```
Step 3. Apply `kub.yaml` and `service.yaml`
Clone the following `kub.yaml` and `service.yaml` files and apply locally
- Use this `kub.yaml` file - [litellm kub.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/kub.yaml)
- Use this `service.yaml` file - [litellm service.yaml](https://github.com/BerriAI/litellm/blob/main/deploy/kubernetes/service.yaml)
Apply `kub.yaml`
```
kubectl apply -f kub.yaml
```
Apply `service.yaml` - creates an AWS load balancer to expose the proxy
```
kubectl apply -f service.yaml
# service/litellm-service created
```
Step 4. Get Proxy Base URL
```shell
kubectl get services
# litellm-service LoadBalancer 10.100.6.31 a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com 4000:30374/TCP 63m
```
Proxy Base URL = `a472dc7c273fd47fd******.us-west-2.elb.amazonaws.com:4000`
That's it, now you can start using LiteLLM Proxy
</TabItem>
<TabItem value="aws-stack" label="AWS Cloud Formation Stack">

View file

@ -12,9 +12,9 @@ Features here are behind a commercial license in our `/enterprise` folder. [**Se
:::
Features:
- ✅ Content Moderation with LLM Guard
- ✅ Content Moderation with LlamaGuard
- ✅ Content Moderation with Google Text Moderations
- ✅ Content Moderation with LLM Guard
- ✅ Reject calls from Blocked User list
- ✅ Reject calls (incoming / outgoing) with Banned Keywords (e.g. competitors)
- ✅ Don't log/store specific requests (eg confidential LLM requests)
@ -23,6 +23,71 @@ Features:
## Content Moderation
### Content Moderation with LLM Guard
Set the LLM Guard API Base in your environment
```env
LLM_GUARD_API_BASE = "http://0.0.0.0:8192" # deployed llm guard api
```
Add `llmguard_moderations` as a callback
```yaml
litellm_settings:
callbacks: ["llmguard_moderations"]
```
Now you can easily test it
- Make a regular /chat/completion call
- Check your proxy logs for any statement with `LLM Guard:`
Expected results:
```
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
```
#### Turn on/off per key
**1. Update config**
```yaml
litellm_settings:
callbacks: ["llmguard_moderations"]
llm_guard_mode: "key-specific"
```
**2. Create new key**
```bash
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data '{
"models": ["fake-openai-endpoint"],
"permissions": {
"enable_llm_guard_check": true # 👈 KEY CHANGE
}
}'
# Returns {..'key': 'my-new-key'}
```
**2. Test it!**
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer my-new-key' \ # 👈 TEST KEY
--data '{"model": "fake-openai-endpoint", "messages": [
{"role": "system", "content": "Be helpful"},
{"role": "user", "content": "What do you know?"}
]
}'
```
### Content Moderation with LlamaGuard
Currently works with Sagemaker's LlamaGuard endpoint.
@ -55,32 +120,7 @@ callbacks: ["llamaguard_moderations"]
llamaguard_unsafe_content_categories: /path/to/llamaguard_prompt.txt
```
### Content Moderation with LLM Guard
Set the LLM Guard API Base in your environment
```env
LLM_GUARD_API_BASE = "http://0.0.0.0:4000"
```
Add `llmguard_moderations` as a callback
```yaml
litellm_settings:
callbacks: ["llmguard_moderations"]
```
Now you can easily test it
- Make a regular /chat/completion call
- Check your proxy logs for any statement with `LLM Guard:`
Expected results:
```
LLM Guard: Received response - {"sanitized_prompt": "hello world", "is_valid": true, "scanners": { "Regex": 0.0 }}
```
### Content Moderation with Google Text Moderation

View file

@ -0,0 +1,53 @@
# Grafana, Prometheus metrics [BETA]
LiteLLM Exposes a `/metrics` endpoint for Prometheus to Poll
## Quick Start
If you're using the LiteLLM CLI with `litellm --config proxy_config.yaml` then you need to `pip install prometheus_client==0.20.0`. **This is already pre-installed on the litellm Docker image**
Add this to your proxy config.yaml
```yaml
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo
litellm_settings:
success_callback: ["prometheus"]
```
Start the proxy
```shell
litellm --config config.yaml --debug
```
Test Request
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--data '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "user",
"content": "what llm are you"
}
]
}'
```
View Metrics on `/metrics`, Visit `http://localhost:4000/metrics`
```shell
http://localhost:4000/metrics
# <proxy_base_url>/metrics
```
## Metrics Tracked
| Metric Name | Description |
|----------------------|--------------------------------------|
| `litellm_requests_metric` | Number of requests made, per `"user", "key", "model"` |
| `litellm_spend_metric` | Total Spend, per `"user", "key", "model"` |
| `litellm_total_tokens` | input + output tokens per `"user", "key", "model"` |

View file

@ -0,0 +1,249 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# ⚡ Best Practices for Production
Expected Performance in Production
1 LiteLLM Uvicorn Worker on Kubernetes
| Description | Value |
|--------------|-------|
| Avg latency | `50ms` |
| Median latency | `51ms` |
| `/chat/completions` Requests/second | `35` |
| `/chat/completions` Requests/minute | `2100` |
| `/chat/completions` Requests/hour | `126K` |
## 1. Switch of Debug Logging
Remove `set_verbose: True` from your config.yaml
```yaml
litellm_settings:
set_verbose: True
```
You should only see the following level of details in logs on the proxy server
```shell
# INFO: 192.168.2.205:11774 - "POST /chat/completions HTTP/1.1" 200 OK
# INFO: 192.168.2.205:34717 - "POST /chat/completions HTTP/1.1" 200 OK
# INFO: 192.168.2.205:29734 - "POST /chat/completions HTTP/1.1" 200 OK
```
## 2. On Kubernetes - Use 1 Uvicorn worker [Suggested CMD]
Use this Docker `CMD`. This will start the proxy with 1 Uvicorn Async Worker
(Ensure that you're not setting `run_gunicorn` or `num_workers` in the CMD).
```shell
CMD ["--port", "4000", "--config", "./proxy_server_config.yaml"]
```
## 2. Batch write spend updates every 60s
The default proxy batch write is 10s. This is to make it easy to see spend when debugging locally.
In production, we recommend using a longer interval period of 60s. This reduces the number of connections used to make DB writes.
```yaml
general_settings:
master_key: sk-1234
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
```
## 3. Move spend logs to separate server
Writing each spend log to the db can slow down your proxy. In testing we saw a 70% improvement in median response time, by moving writing spend logs to a separate server.
👉 [LiteLLM Spend Logs Server](https://github.com/BerriAI/litellm/tree/main/litellm-js/spend-logs)
**Spend Logs**
This is a log of the key, tokens, model, and latency for each call on the proxy.
[**Full Payload**](https://github.com/BerriAI/litellm/blob/8c9623a6bc4ad9da0a2dac64249a60ed8da719e8/litellm/proxy/utils.py#L1769)
**1. Start the spend logs server**
```bash
docker run -p 3000:3000 \
-e DATABASE_URL="postgres://.." \
ghcr.io/berriai/litellm-spend_logs:main-latest
# RUNNING on http://0.0.0.0:3000
```
**2. Connect to proxy**
Example litellm_config.yaml
```yaml
model_list:
- model_name: fake-openai-endpoint
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: https://exampleopenaiendpoint-production.up.railway.app/
general_settings:
master_key: sk-1234
proxy_batch_write_at: 5 # 👈 Frequency of batch writing logs to server (in seconds)
```
Add `SPEND_LOGS_URL` as an environment variable when starting the proxy
```bash
docker run \
-v $(pwd)/litellm_config.yaml:/app/config.yaml \
-e DATABASE_URL="postgresql://.." \
-e SPEND_LOGS_URL="http://host.docker.internal:3000" \ # 👈 KEY CHANGE
-p 4000:4000 \
ghcr.io/berriai/litellm:main-latest \
--config /app/config.yaml --detailed_debug
# Running on http://0.0.0.0:4000
```
**3. Test Proxy!**
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{
"model": "fake-openai-endpoint",
"messages": [
{"role": "system", "content": "Be helpful"},
{"role": "user", "content": "What do you know?"}
]
}'
```
In your LiteLLM Spend Logs Server, you should see
**Expected Response**
```
Received and stored 1 logs. Total logs in memory: 1
...
Flushed 1 log to the DB.
```
### Machine Specification
A t2.micro should be sufficient to handle 1k logs / minute on this server.
This consumes at max 120MB, and <0.1 vCPU.
## 4. Switch off resetting budgets
Add this to your config.yaml. (Only spend per Key, User and Team will be tracked - spend per API Call will not be written to the LiteLLM Database)
```yaml
general_settings:
disable_spend_logs: true
disable_reset_budget: true
```
## 5. Switch of `litellm.telemetry`
Switch of all telemetry tracking done by litellm
```yaml
litellm_settings:
telemetry: False
```
## Machine Specifications to Deploy LiteLLM
| Service | Spec | CPUs | Memory | Architecture | Version|
| --- | --- | --- | --- | --- | --- |
| Server | `t2.small`. | `1vCPUs` | `8GB` | `x86` |
| Redis Cache | - | - | - | - | 7.0+ Redis Engine|
## Reference Kubernetes Deployment YAML
Reference Kubernetes `deployment.yaml` that was load tested by us
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: litellm-deployment
spec:
replicas: 3
selector:
matchLabels:
app: litellm
template:
metadata:
labels:
app: litellm
spec:
containers:
- name: litellm-container
image: ghcr.io/berriai/litellm:main-latest
imagePullPolicy: Always
env:
- name: AZURE_API_KEY
value: "d6******"
- name: AZURE_API_BASE
value: "https://ope******"
- name: LITELLM_MASTER_KEY
value: "sk-1234"
- name: DATABASE_URL
value: "po**********"
args:
- "--config"
- "/app/proxy_config.yaml" # Update the path to mount the config file
volumeMounts: # Define volume mount for proxy_config.yaml
- name: config-volume
mountPath: /app
readOnly: true
livenessProbe:
httpGet:
path: /health/liveliness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
readinessProbe:
httpGet:
path: /health/readiness
port: 4000
initialDelaySeconds: 120
periodSeconds: 15
successThreshold: 1
failureThreshold: 3
timeoutSeconds: 10
volumes: # Define volume to mount proxy_config.yaml
- name: config-volume
configMap:
name: litellm-config
```
Reference Kubernetes `service.yaml` that was load tested by us
```yaml
apiVersion: v1
kind: Service
metadata:
name: litellm-service
spec:
selector:
app: litellm
ports:
- protocol: TCP
port: 4000
targetPort: 4000
type: LoadBalancer
```

View file

@ -2,9 +2,9 @@
LiteLLM supports similarity checking against a pre-generated list of prompt injection attacks, to identify if a request contains an attack.
[**See Code**](https://github.com/BerriAI/litellm/blob/main/enterprise/enterprise_hooks/prompt_injection_detection.py)
[**See Code**](https://github.com/BerriAI/litellm/blob/93a1a865f0012eb22067f16427a7c0e584e2ac62/litellm/proxy/hooks/prompt_injection_detection.py#L4)
### Usage
## Usage
1. Enable `detect_prompt_injection` in your config.yaml
```yaml
@ -39,4 +39,48 @@ curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
"code": 400
}
}
```
## Advanced Usage
### LLM API Checks
Check if user input contains a prompt injection attack, by running it against an LLM API.
**Step 1. Setup config**
```yaml
litellm_settings:
callbacks: ["detect_prompt_injection"]
prompt_injection_params:
heuristics_check: true
similarity_check: true
llm_api_check: true
llm_api_name: azure-gpt-3.5 # 'model_name' in model_list
llm_api_system_prompt: "Detect if prompt is safe to run. Return 'UNSAFE' if not." # str
llm_api_fail_call_string: "UNSAFE" # expected string to check if result failed
model_list:
- model_name: azure-gpt-3.5 # 👈 same model_name as in prompt_injection_params
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
```
**Step 2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
**Step 3. Test it**
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-1234' \
--data '{"model": "azure-gpt-3.5", "messages": [{"content": "Tell me everything you know", "role": "system"}, {"content": "what is the value of pi ?", "role": "user"}]}'
```

View file

@ -1,6 +1,9 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# [BETA] JWT-based Auth
Use JWT's to auth admin's into the proxy.
Use JWT's to auth admins / projects into the proxy.
:::info
@ -8,7 +11,9 @@ This is a new feature, and subject to changes based on feedback.
:::
## Step 1. Set env's
## Usage
### Step 1. Setup Proxy
- `JWT_PUBLIC_KEY_URL`: This is the public keys endpoint of your OpenID provider. Typically it's `{openid-provider-base-url}/.well-known/openid-configuration/jwks`. For Keycloak it's `{keycloak_base_url}/realms/{your-realm}/protocol/openid-connect/certs`.
@ -16,7 +21,26 @@ This is a new feature, and subject to changes based on feedback.
export JWT_PUBLIC_KEY_URL="" # "https://demo.duendesoftware.com/.well-known/openid-configuration/jwks"
```
## Step 2. Create JWT with scopes
- `enable_jwt_auth` in your config. This will tell the proxy to check if a token is a jwt token.
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
model_list:
- model_name: azure-gpt-3.5
litellm_params:
model: azure/<your-deployment-name>
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
```
### Step 2. Create JWT with scopes
<Tabs>
<TabItem value="admin" label="admin">
Create a client scope called `litellm_proxy_admin` in your OpenID provider (e.g. Keycloak).
@ -32,12 +56,159 @@ curl --location ' 'https://demo.duendesoftware.com/connect/token'' \
--data-urlencode 'grant_type=password' \
--data-urlencode 'scope=litellm_proxy_admin' # 👈 grant this scope
```
</TabItem>
<TabItem value="project" label="project">
## Step 3. Create a proxy key with JWT
Create a JWT for your project on your OpenID provider (e.g. Keycloak).
```bash
curl --location ' 'https://demo.duendesoftware.com/connect/token'' \
--header 'Content-Type: application/x-www-form-urlencoded' \
--data-urlencode 'client_id={CLIENT_ID}' \ # 👈 project id
--data-urlencode 'client_secret={CLIENT_SECRET}' \
--data-urlencode 'grant_type=client_credential' \
```
</TabItem>
</Tabs>
### Step 3. Test your JWT
<Tabs>
<TabItem value="key" label="/key/generate">
```bash
curl --location '{proxy_base_url}/key/generate' \
--header 'Authorization: Bearer eyJhbGciOiJSUzI1NiI...' \
--header 'Content-Type: application/json' \
--data '{}'
```
```
</TabItem>
<TabItem value="llm_call" label="/chat/completions">
```bash
curl --location 'http://0.0.0.0:4000/v1/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer eyJhbGciOiJSUzI1...' \
--data '{"model": "azure-gpt-3.5", "messages": [ { "role": "user", "content": "What's the weather like in Boston today?" } ]}'
```
</TabItem>
</Tabs>
## Advanced - Set Accepted JWT Scope Names
Change the string in JWT 'scopes', that litellm evaluates to see if a user has admin access.
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
litellm_jwtauth:
admin_jwt_scope: "litellm-proxy-admin"
```
### JWT Scopes
Here's what scopes on JWT-Auth tokens look like
**Can be a list**
```
scope: ["litellm-proxy-admin",...]
```
**Can be a space-separated string**
```
scope: "litellm-proxy-admin ..."
```
## Advanced - Allowed Routes
Configure which routes a JWT can access via the config.
By default:
- Admins: can access only management routes (`/team/*`, `/key/*`, `/user/*`)
- Teams: can access only openai routes (`/chat/completions`, etc.)+ info routes (`/*/info`)
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
**Admin Routes**
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
litellm_jwtauth:
admin_jwt_scope: "litellm-proxy-admin"
admin_allowed_routes: ["/v1/embeddings"]
```
**Team Routes**
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
litellm_jwtauth:
...
team_jwt_scope: "litellm-team" # 👈 Set JWT Scope string
team_allowed_routes: ["/v1/chat/completions"] # 👈 Set accepted routes
```
## Advanced - Caching Public Keys
Control how long public keys are cached for (in seconds).
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
litellm_jwtauth:
admin_jwt_scope: "litellm-proxy-admin"
admin_allowed_routes: ["/v1/embeddings"]
public_key_ttl: 600 # 👈 KEY CHANGE
```
## Advanced - Custom JWT Field
Set a custom field in which the team_id exists. By default, the 'client_id' field is checked.
```yaml
general_settings:
master_key: sk-1234
enable_jwt_auth: True
litellm_jwtauth:
team_id_jwt_field: "client_id" # 👈 KEY CHANGE
```
## All Params
[**See Code**](https://github.com/BerriAI/litellm/blob/b204f0c01c703317d812a1553363ab0cb989d5b6/litellm/proxy/_types.py#L95)
## Advanced - Block Teams
To block all requests for a certain team id, use `/team/block`
**Block Team**
```bash
curl --location 'http://0.0.0.0:4000/team/block' \
--header 'Authorization: Bearer <admin-token>' \
--header 'Content-Type: application/json' \
--data '{
"team_id": "litellm-test-client-id-new" # 👈 set team id
}'
```
**Unblock Team**
```bash
curl --location 'http://0.0.0.0:4000/team/unblock' \
--header 'Authorization: Bearer <admin-token>' \
--header 'Content-Type: application/json' \
--data '{
"team_id": "litellm-test-client-id-new" # 👈 set team id
}'
```

View file

@ -47,8 +47,9 @@ Your Proxy Swagger is available on the root of the Proxy: e.g.: `http://localhos
Set the following in your .env on the Proxy
```shell
UI_USERNAME=ishaan-litellm
UI_PASSWORD=langchain
LITELLM_MASTER_KEY="sk-1234" # this is your master key for using the proxy server
UI_USERNAME=ishaan-litellm # username to sign in on UI
UI_PASSWORD=langchain # password to sign in on UI
```
On accessing the LiteLLM UI, you will be prompted to enter your username, password

View file

@ -1,14 +1,14 @@
# 🔑 Virtual Keys, Users
Track Spend, Set budgets and create virtual keys for the proxy
Grant other's temporary access to your proxy, with keys that expire after a set duration.
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
# 🔑 Virtual Keys
Track Spend, and control model access via virtual keys for the proxy
:::info
- 🔑 [UI to Generate, Edit, Delete Keys (with SSO)](https://docs.litellm.ai/docs/proxy/ui)
- [Deploy LiteLLM Proxy with Key Management](https://docs.litellm.ai/docs/proxy/deploy#deploy-with-database)
- Dockerfile.database for LiteLLM Proxy + Key Management [here](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
- [Dockerfile.database for LiteLLM Proxy + Key Management](https://github.com/BerriAI/litellm/blob/main/Dockerfile.database)
:::
@ -30,7 +30,7 @@ export DATABASE_URL=postgresql://<user>:<password>@<host>:<port>/<dbname>
```
You can then generate temporary keys by hitting the `/key/generate` endpoint.
You can then generate keys by hitting the `/key/generate` endpoint.
[**See code**](https://github.com/BerriAI/litellm/blob/7a669a36d2689c7f7890bc9c93e04ff3c2641299/litellm/proxy/proxy_server.py#L672)
@ -46,8 +46,8 @@ model_list:
model: ollama/llama2
general_settings:
master_key: sk-1234 # [OPTIONAL] if set all calls to proxy will require either this key or a valid generated token
database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>"
master_key: sk-1234
database_url: "postgresql://<user>:<password>@<host>:<port>/<dbname>" # 👈 KEY CHANGE
```
**Step 2: Start litellm**
@ -56,62 +56,220 @@ general_settings:
litellm --config /path/to/config.yaml
```
**Step 3: Generate temporary keys**
**Step 3: Generate keys**
```shell
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"], "duration": "20m","metadata": {"user": "ishaan@berri.ai"}}'
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "metadata": {"user": "ishaan@berri.ai"}}'
```
## Advanced - Spend Tracking
## /key/generate
Get spend per:
- key - via `/key/info` [Swagger](https://litellm-api.up.railway.app/#/key%20management/info_key_fn_key_info_get)
- user - via `/user/info` [Swagger](https://litellm-api.up.railway.app/#/user%20management/user_info_user_info_get)
- team - via `/team/info` [Swagger](https://litellm-api.up.railway.app/#/team%20management/team_info_team_info_get)
- ⏳ end-users - via `/end_user/info` - [Comment on this issue for end-user cost tracking](https://github.com/BerriAI/litellm/issues/2633)
### Request
```shell
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"duration": "20m",
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
"soft_budget": 5,
}'
**How is it calculated?**
The cost per model is stored [here](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json) and calculated by the [`completion_cost`](https://github.com/BerriAI/litellm/blob/db7974f9f216ee50b53c53120d1e3fc064173b60/litellm/utils.py#L3771) function.
**How is it tracking?**
Spend is automatically tracked for the key in the "LiteLLM_VerificationTokenTable". If the key has an attached 'user_id' or 'team_id', the spend for that user is tracked in the "LiteLLM_UserTable", and team in the "LiteLLM_TeamTable".
<Tabs>
<TabItem value="key-info" label="Key Spend">
You can get spend for a key by using the `/key/info` endpoint.
```bash
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
This is automatically updated (in USD) when calls are made to /completions, /chat/completions, /embeddings using litellm's completion_cost() function. [**See Code**](https://github.com/BerriAI/litellm/blob/1a6ea20a0bb66491968907c2bfaabb7fe45fc064/litellm/utils.py#L1654).
Request Params:
- `duration`: *Optional[str]* - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
- `key_alias`: *Optional[str]* - User defined key alias
- `team_id`: *Optional[str]* - The team id of the user
- `models`: *Optional[list]* - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)
- `aliases`: *Optional[dict]* - Any alias mappings, on top of anything in the config.yaml model list. - https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---upgradedowngrade-models
- `config`: *Optional[dict]* - any key-specific configs, overrides config in config.yaml
- `spend`: *Optional[int]* - Amount spent by key. Default is 0. Will be updated by proxy whenever key is used. https://docs.litellm.ai/docs/proxy/virtual_keys#managing-auth---tracking-spend
- `max_budget`: *Optional[float]* - Specify max budget for a given key.
- `soft_budget`: *Optional[float]* - Specify soft limit budget for a given key. Get Alerts when key hits its soft budget
- `model_max_budget`: *Optional[dict[str, float]]* - Specify max budget for each model, `model_max_budget={"gpt4": 0.5, "gpt-5": 0.01}`
- `max_parallel_requests`: *Optional[int]* - Rate limit a user based on the number of parallel requests. Raises 429 error, if user's parallel requests > x.
- `metadata`: *Optional[dict]* - Metadata for key, store information for key. Example metadata = {"team": "core-infra", "app": "app2", "email": "ishaan@berri.ai" }
### Response
**Sample response**
```python
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA", # Bearer token
"expires": "2023-11-19T01:38:25.834000+00:00" # datetime object
"key_name": "sk-...7sFA" # abbreviated key string, ONLY stored in db if `allow_user_auth: true` set - [see](./ui.md)
...
"key": "sk-tXL0wt5-lOOVK9sfY2UacA",
"info": {
"token": "sk-tXL0wt5-lOOVK9sfY2UacA",
"spend": 0.0001065, # 👈 SPEND
"expires": "2023-11-24T23:19:11.131000Z",
"models": [
"gpt-3.5-turbo",
"gpt-4",
"claude-2"
],
"aliases": {
"mistral-7b": "gpt-3.5-turbo"
},
"config": {}
}
}
```
### Upgrade/Downgrade Models
</TabItem>
<TabItem value="user-info" label="User Spend">
**1. Create a user**
```bash
curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{user_email: "krrish@berri.ai"}'
```
**Expected Response**
```bash
{
...
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "my-unique-id", # 👈 unique id
"max_budget": 0.0
}
```
**2. Create a key for that user**
```bash
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "user_id": "my-unique-id"}'
```
Returns a key - `sk-...`.
**3. See spend for user**
```bash
curl 'http://0.0.0.0:4000/user/info?user_id=my-unique-id' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
Expected Response
```bash
{
...
"spend": 0 # 👈 SPEND
}
```
</TabItem>
<TabItem value="team-info" label="Team Spend">
Use teams, if you want keys to be owned by multiple people (e.g. for a production app).
**1. Create a team**
```bash
curl --location 'http://localhost:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"team_alias": "my-awesome-team"}'
```
**Expected Response**
```bash
{
...
"expires": "2023-12-22T09:53:13.861000Z",
"team_id": "my-unique-id", # 👈 unique id
"max_budget": 0.0
}
```
**2. Create a key for that team**
```bash
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["gpt-3.5-turbo", "gpt-4"], "team_id": "my-unique-id"}'
```
Returns a key - `sk-...`.
**3. See spend for team**
```bash
curl 'http://0.0.0.0:4000/team/info?team_id=my-unique-id' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
Expected Response
```bash
{
...
"spend": 0 # 👈 SPEND
}
```
</TabItem>
</Tabs>
## Advanced - Model Access
### Restrict models by `team_id`
`litellm-dev` can only access `azure-gpt-3.5`
**1. Create a team via `/team/new`**
```shell
curl --location 'http://localhost:4000/team/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"team_alias": "litellm-dev",
"models": ["azure-gpt-3.5"]
}'
# returns {...,"team_id": "my-unique-id"}
```
**2. Create a key for team**
```shell
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"team_id": "my-unique-id"}'
```
**3. Test it**
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
--data '{
"model": "BEDROCK_GROUP",
"messages": [
{
"role": "user",
"content": "hi"
}
]
}'
```
```shell
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
```
### Model Aliases
If a user is expected to use a given model (i.e. gpt3-5), and you want to:
@ -189,421 +347,9 @@ curl --location 'http://localhost:4000/key/generate' \
"max_budget": 0,}'
```
## Advanced - Custom Auth
## /key/info
### Request
```shell
curl -X GET "http://0.0.0.0:4000/key/info?key=sk-02Wr4IAlN3NvPXvL5JVvDA" \
-H "Authorization: Bearer sk-1234"
```
Request Params:
- key: str - The key you want the info for
### Response
`token` is the hashed key (The DB stores the hashed key for security)
```json
{
"key": "sk-02Wr4IAlN3NvPXvL5JVvDA",
"info": {
"token": "80321a12d03412c527f2bd9db5fabd746abead2e1d50b435a534432fbaca9ef5",
"spend": 0.0,
"expires": "2024-01-18T23:52:09.125000+00:00",
"models": ["azure-gpt-3.5", "azure-embedding-model"],
"aliases": {},
"config": {},
"user_id": "ishaan2@berri.ai",
"team_id": "None",
"max_parallel_requests": null,
"metadata": {}
}
}
```
## /key/update
### Request
```shell
curl 'http://0.0.0.0:4000/key/update' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra"
}'
```
Request Params:
- key: str - The key that needs to be updated.
- models: list or null (optional) - Specify the models a token has access to. If null, then the token has access to all models on the server.
- metadata: dict or null (optional) - Pass metadata for the updated token. If null, defaults to an empty dictionary.
- team_id: str or null (optional) - Specify the team_id for the associated key.
### Response
```json
{
"key": "sk-kdEXbIqZRwEeEiHwdg7sFA",
"models": ["gpt-3.5-turbo", "gpt-4", "claude-2"],
"metadata": {
"user": "ishaan@berri.ai"
}
}
```
## /key/delete
### Request
```shell
curl 'http://0.0.0.0:4000/key/delete' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}'
```
Request Params:
- keys: List[str] - List of keys to delete
### Response
```json
{
"deleted_keys": ["sk-kdEXbIqZRwEeEiHwdg7sFA"]
}
```
## /user/new
### Request
All [key/generate params supported](#keygenerate) for creating a user
```shell
curl 'http://0.0.0.0:4000/user/new' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{
"user_id": "ishaan1",
"user_email": "ishaan@litellm.ai",
"user_role": "admin",
"team_id": "cto-team",
"max_budget": 20,
"budget_duration": "1h"
}'
```
Request Params:
- user_id: str (optional - defaults to uuid) - The unique identifier for the user.
- user_email: str (optional - defaults to "") - The email address associated with the user.
- user_role: str (optional - defaults to "app_user") - The role assigned to the user. Can be "admin", "app_owner", "app_user"
**Possible `user_role` values**
```
"admin" - Maintaining the proxy and owning the overall budget
"app_owner" - employees maintaining the apps, each owner may own more than one app
"app_user" - users who know nothing about the proxy. These users get created when you pass `user` to /chat/completions
```
- team_id: str (optional - defaults to "") - The identifier for the team to which the user belongs.
- max_budget: float (optional - defaults to `null`) - The maximum budget allocated for the user. No budget checks done if `max_budget==null`
- budget_duration: str (optional - defaults to `null`) - The duration for which the budget is valid, e.g., "1h", "1d"
### Response
A key will be generated for the new user created
```shell
{
"models": [],
"spend": 0.0,
"max_budget": null,
"user_id": "ishaan1",
"team_id": null,
"max_parallel_requests": null,
"metadata": {},
"tpm_limit": null,
"rpm_limit": null,
"budget_duration": null,
"allowed_cache_controls": [],
"key_alias": null,
"duration": null,
"aliases": {},
"config": {},
"key": "sk-JflB33ucTqc2NYvNAgiBCA",
"key_name": null,
"expires": null
}
```
## /user/info
### Request
#### View all Users
If you're trying to view all users, we recommend using pagination with the following args
- `view_all=true`
- `page=0` Optional(int) min = 0, default=0
- `page_size=25` Optional(int) min = 1, default = 25
```shell
curl -X GET "http://0.0.0.0:4000/user/info?view_all=true&page=0&page_size=25" -H "Authorization: Bearer sk-1234"
```
#### View specific user_id
```shell
curl -X GET "http://0.0.0.0:4000/user/info?user_id=228da235-eef0-4c30-bf53-5d6ac0d278c2" -H "Authorization: Bearer sk-1234"
```
### Response
View user spend, budget, models, keys and teams
```json
{
"user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
"user_info": {
"user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
"team_id": null,
"teams": [],
"user_role": "app_user",
"max_budget": null,
"spend": 200000.0,
"user_email": null,
"models": [],
"max_parallel_requests": null,
"tpm_limit": null,
"rpm_limit": null,
"budget_duration": null,
"budget_reset_at": null,
"allowed_cache_controls": [],
"model_spend": {
"chatgpt-v-2": 200000
},
"model_max_budget": {}
},
"keys": [
{
"token": "16c337f9df00a0e6472627e39a2ed02e67bc9a8a760c983c4e9b8cad7954f3c0",
"key_name": null,
"key_alias": null,
"spend": 200000.0,
"expires": null,
"models": [],
"aliases": {},
"config": {},
"user_id": "228da235-eef0-4c30-bf53-5d6ac0d278c2",
"team_id": null,
"permissions": {},
"max_parallel_requests": null,
"metadata": {},
"tpm_limit": null,
"rpm_limit": null,
"max_budget": null,
"budget_duration": null,
"budget_reset_at": null,
"allowed_cache_controls": [],
"model_spend": {
"chatgpt-v-2": 200000
},
"model_max_budget": {}
}
],
"teams": []
}
```
## Advanced
### Upperbound /key/generate params
Use this, if you need to control the upperbound that users can use for `max_budget`, `budget_duration` or any `key/generate` param per key.
Set `litellm_settings:upperbound_key_generate_params`:
```yaml
litellm_settings:
upperbound_key_generate_params:
max_budget: 100 # upperbound of $100, for all /key/generate requests
duration: "30d" # upperbound of 30 days for all /key/generate requests
```
** Expected Behavior **
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
### Default /key/generate params
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: # blank means `null`
metadata: {"setting":"default"}
team_id: "core-infra"
```
### Restrict models by `team_id`
`litellm-dev` can only access `azure-gpt-3.5`
```yaml
litellm_settings:
default_team_settings:
- team_id: litellm-dev
models: ["azure-gpt-3.5"]
```
#### Create key with team_id="litellm-dev"
```shell
curl --location 'http://localhost:4000/key/generate' \
--header 'Authorization: Bearer sk-1234' \
--header 'Content-Type: application/json' \
--data-raw '{"team_id": "litellm-dev"}'
```
#### Use Key to call invalid model - Fails
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-qo992IjKOC2CHKZGRoJIGA' \
--data '{
"model": "BEDROCK_GROUP",
"messages": [
{
"role": "user",
"content": "hi"
}
]
}'
```
```shell
{"error":{"message":"Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n\nTraceback (most recent call last):\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/proxy_server.py\", line 2298, in chat_completion\n _is_valid_team_configs(\n File \"/Users/ishaanjaffer/Github/litellm/litellm/proxy/utils.py\", line 1296, in _is_valid_team_configs\n raise Exception(\nException: Invalid model for team litellm-dev: BEDROCK_GROUP. Valid models for team are: ['azure-gpt-3.5']\n\n","type":"None","param":"None","code":500}}%
```
### Set Budgets - Per Key
Set `max_budget` in (USD $) param in the `key/generate` request. By default the `max_budget` is set to `null` and is not checked for keys
```shell
curl 'http://0.0.0.0:4000/key/generate' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{
"metadata": {"user": "ishaan@berri.ai"},
"team_id": "core-infra",
"max_budget": 10,
}'
```
#### Expected Behaviour
- Costs Per key get auto-populated in `LiteLLM_VerificationToken` Table
- After the key crosses it's `max_budget`, requests fail
Example Request to `/chat/completions` when key has crossed budget
```shell
curl --location 'http://0.0.0.0:4000/chat/completions' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer sk-ULl_IKCVFy2EZRzQB16RUA' \
--data ' {
"model": "azure-gpt-3.5",
"user": "e09b4da8-ed80-4b05-ac93-e16d9eb56fca",
"messages": [
{
"role": "user",
"content": "respond in 50 lines"
}
],
}'
```
Expected Response from `/chat/completions` when key has crossed budget
```shell
{
"detail":"Authentication Error, ExceededTokenBudget: Current spend for token: 7.2e-05; Max Budget for Token: 2e-07"
}
```
### Set Budgets - Per User
LiteLLM exposes a `/user/new` endpoint to create budgets for users, that persist across multiple keys.
This is documented in the swagger (live on your server root endpoint - e.g. `http://0.0.0.0:4000/`). Here's an example request.
```shell
curl --location 'http://localhost:4000/user/new' \
--header 'Authorization: Bearer <your-master-key>' \
--header 'Content-Type: application/json' \
--data-raw '{"models": ["azure-models"], "max_budget": 0, "user_id": "krrish3@berri.ai"}'
```
The request is a normal `/key/generate` request body + a `max_budget` field.
**Sample Response**
```shell
{
"key": "sk-YF2OxDbrgd1y2KgwxmEA2w",
"expires": "2023-12-22T09:53:13.861000Z",
"user_id": "krrish3@berri.ai",
"max_budget": 0.0
}
```
### Tracking Spend
You can get spend for a key by using the `/key/info` endpoint.
```bash
curl 'http://0.0.0.0:4000/key/info?key=<user-key>' \
-X GET \
-H 'Authorization: Bearer <your-master-key>'
```
This is automatically updated (in USD) when calls are made to /completions, /chat/completions, /embeddings using litellm's completion_cost() function. [**See Code**](https://github.com/BerriAI/litellm/blob/1a6ea20a0bb66491968907c2bfaabb7fe45fc064/litellm/utils.py#L1654).
**Sample response**
```python
{
"key": "sk-tXL0wt5-lOOVK9sfY2UacA",
"info": {
"token": "sk-tXL0wt5-lOOVK9sfY2UacA",
"spend": 0.0001065,
"expires": "2023-11-24T23:19:11.131000Z",
"models": [
"gpt-3.5-turbo",
"gpt-4",
"claude-2"
],
"aliases": {
"mistral-7b": "gpt-3.5-turbo"
},
"config": {}
}
}
```
### Custom Auth
You can now override the default api key auth.
You can now override the default api key auth.
Here's how:
@ -737,4 +483,56 @@ litellm_settings:
general_settings:
custom_key_generate: custom_auth.custom_generate_key_fn
```
```
## Upperbound /key/generate params
Use this, if you need to set default upperbounds for `max_budget`, `budget_duration` or any `key/generate` param per key.
Set `litellm_settings:upperbound_key_generate_params`:
```yaml
litellm_settings:
upperbound_key_generate_params:
max_budget: 100 # upperbound of $100, for all /key/generate requests
duration: "30d" # upperbound of 30 days for all /key/generate requests
```
** Expected Behavior **
- Send a `/key/generate` request with `max_budget=200`
- Key will be created with `max_budget=100` since 100 is the upper bound
## Default /key/generate params
Use this, if you need to control the default `max_budget` or any `key/generate` param per key.
When a `/key/generate` request does not specify `max_budget`, it will use the `max_budget` specified in `default_key_generate_params`
Set `litellm_settings:default_key_generate_params`:
```yaml
litellm_settings:
default_key_generate_params:
max_budget: 1.5000
models: ["azure-gpt-3.5"]
duration: # blank means `null`
metadata: {"setting":"default"}
team_id: "core-infra"
```
## Endpoints
### Keys
#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/key%20management/)
### Users
#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/user%20management/)
### Teams
#### [**👉 API REFERENCE DOCS**](https://litellm-api.up.railway.app/#/team%20management)

View file

@ -442,6 +442,8 @@ If a call fails after num_retries, fall back to another model group.
If the error is a context window exceeded error, fall back to a larger model group (if given).
Fallbacks are done in-order - ["gpt-3.5-turbo, "gpt-4", "gpt-4-32k"], will do 'gpt-3.5-turbo' first, then 'gpt-4', etc.
```python
from litellm import Router
@ -551,6 +553,156 @@ router = Router(model_list: Optional[list] = None,
cache_responses=True)
```
## Pre-Call Checks (Context Window)
Enable pre-call checks to filter out deployments with context window limit < messages for a call.
<Tabs>
<TabItem value="sdk" label="SDK">
**1. Enable pre-call checks**
```python
from litellm import Router
# ...
router = Router(model_list=model_list, enable_pre_call_checks=True) # 👈 Set to True
```
**2. (Azure-only) Set base model**
For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with `azure/`.
```python
model_list = [
{
"model_name": "gpt-3.5-turbo", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"model_info": {
"base_model": "azure/gpt-35-turbo", # 👈 SET BASE MODEL
}
},
{
"model_name": "gpt-3.5-turbo", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo-1106",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
]
```
**3. Test it!**
```python
"""
- Give a gpt-3.5-turbo model group with different context windows (4k vs. 16k)
- Send a 5k prompt
- Assert it works
"""
from litellm import Router
import os
try:
model_list = [
{
"model_name": "gpt-3.5-turbo", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
"model_info": {
"base_model": "azure/gpt-35-turbo",
}
},
{
"model_name": "gpt-3.5-turbo", # model group name
"litellm_params": { # params for litellm completion/embedding call
"model": "gpt-3.5-turbo-1106",
"api_key": os.getenv("OPENAI_API_KEY"),
},
},
]
router = Router(model_list=model_list, enable_pre_call_checks=True)
text = "What is the meaning of 42?" * 5000
response = router.completion(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": text},
{"role": "user", "content": "Who was Alexander?"},
],
)
print(f"response: {response}")
```
</TabItem>
<TabItem value="proxy" label="Proxy">
**1. Setup config**
For azure deployments, set the base model. Pick the base model from [this list](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json), all the azure models start with azure/.
```yaml
router_settings:
enable_pre_call_checks: true # 1. Enable pre-call checks
model_list:
- model_name: gpt-3.5-turbo
litellm_params:
model: azure/chatgpt-v-2
api_base: os.environ/AZURE_API_BASE
api_key: os.environ/AZURE_API_KEY
api_version: "2023-07-01-preview"
model_info:
base_model: azure/gpt-4-1106-preview # 2. 👈 (azure-only) SET BASE MODEL
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
```
**2. Start proxy**
```bash
litellm --config /path/to/config.yaml
# RUNNING on http://0.0.0.0:4000
```
**3. Test it!**
```python
import openai
client = openai.OpenAI(
api_key="anything",
base_url="http://0.0.0.0:4000"
)
text = "What is the meaning of 42?" * 5000
# request sent to model set on litellm proxy, `litellm --model`
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages = [
{"role": "system", "content": text},
{"role": "user", "content": "Who was Alexander?"},
],
)
print(response)
```
</TabItem>
</Tabs>
## Caching across model groups
If you want to cache across 2 different model groups (e.g. azure deployments, and openai), use caching groups.

View file

@ -95,5 +95,4 @@ completion_with_split_tests(
)
```
### A/B Testing Dashboard after running code - https://admin.litellm.ai/
<Image img={require('../../img/ab_test_logs.png')} />

View file

@ -0,0 +1,95 @@
# Instructor - Function Calling
Use LiteLLM Router with [jxnl's instructor library](https://github.com/jxnl/instructor) for function calling in prod.
## Usage
```python
import litellm
from litellm import Router
import instructor
from pydantic import BaseModel
litellm.set_verbose = True # 👈 print DEBUG LOGS
client = instructor.patch(
Router(
model_list=[
{
"model_name": "gpt-3.5-turbo", openai model name
"litellm_params": { # params for litellm completion/embedding call - e.g.: https://github.com/BerriAI/litellm/blob/62a591f90c99120e1a51a8445f5c3752586868ea/litellm/router.py#L111
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
}
]
)
)
class UserDetail(BaseModel):
name: str
age: int
user = client.chat.completions.create(
model="gpt-3.5-turbo",
response_model=UserDetail,
messages=[
{"role": "user", "content": "Extract Jason is 25 years old"},
],
)
assert isinstance(user, UserDetail)
assert user.name == "Jason"
assert user.age == 25
print(f"user: {user}")
```
## Async Calls
```python
import litellm
from litellm import Router
import instructor, asyncio
from pydantic import BaseModel
aclient = instructor.apatch(
Router(
model_list=[
{
"model_name": "gpt-3.5-turbo",
"litellm_params": {
"model": "azure/chatgpt-v-2",
"api_key": os.getenv("AZURE_API_KEY"),
"api_version": os.getenv("AZURE_API_VERSION"),
"api_base": os.getenv("AZURE_API_BASE"),
},
}
],
default_litellm_params={"acompletion": True}, # 👈 IMPORTANT - tells litellm to route to async completion function.
)
)
class UserExtract(BaseModel):
name: str
age: int
async def main():
model = await aclient.chat.completions.create(
model="gpt-3.5-turbo",
response_model=UserExtract,
messages=[
{"role": "user", "content": "Extract jason is 25 years old"},
],
)
print(f"model: {model}")
asyncio.run(main())
```

Binary file not shown.

Before

Width:  |  Height:  |  Size: 263 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 449 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 66 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 89 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 140 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 386 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 180 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 429 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 505 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 468 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 123 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 203 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 81 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 82 KiB

View file

@ -5561,12 +5561,12 @@
}
},
"node_modules/body-parser": {
"version": "1.20.1",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz",
"integrity": "sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==",
"version": "1.20.2",
"resolved": "https://registry.npmjs.org/body-parser/-/body-parser-1.20.2.tgz",
"integrity": "sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==",
"dependencies": {
"bytes": "3.1.2",
"content-type": "~1.0.4",
"content-type": "~1.0.5",
"debug": "2.6.9",
"depd": "2.0.0",
"destroy": "1.2.0",
@ -5574,7 +5574,7 @@
"iconv-lite": "0.4.24",
"on-finished": "2.4.1",
"qs": "6.11.0",
"raw-body": "2.5.1",
"raw-body": "2.5.2",
"type-is": "~1.6.18",
"unpipe": "1.0.0"
},
@ -6707,9 +6707,9 @@
"integrity": "sha512-ASFBup0Mz1uyiIjANan1jzLQami9z1PoYSZCiiYW2FczPbenXc45FZdBZLzOT+r6+iciuEModtmCti+hjaAk0A=="
},
"node_modules/cookie": {
"version": "0.5.0",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz",
"integrity": "sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==",
"version": "0.6.0",
"resolved": "https://registry.npmjs.org/cookie/-/cookie-0.6.0.tgz",
"integrity": "sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==",
"engines": {
"node": ">= 0.6"
}
@ -10411,16 +10411,16 @@
}
},
"node_modules/express": {
"version": "4.18.2",
"resolved": "https://registry.npmjs.org/express/-/express-4.18.2.tgz",
"integrity": "sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==",
"version": "4.19.2",
"resolved": "https://registry.npmjs.org/express/-/express-4.19.2.tgz",
"integrity": "sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==",
"dependencies": {
"accepts": "~1.3.8",
"array-flatten": "1.1.1",
"body-parser": "1.20.1",
"body-parser": "1.20.2",
"content-disposition": "0.5.4",
"content-type": "~1.0.4",
"cookie": "0.5.0",
"cookie": "0.6.0",
"cookie-signature": "1.0.6",
"debug": "2.6.9",
"depd": "2.0.0",
@ -17016,9 +17016,9 @@
}
},
"node_modules/raw-body": {
"version": "2.5.1",
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz",
"integrity": "sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==",
"version": "2.5.2",
"resolved": "https://registry.npmjs.org/raw-body/-/raw-body-2.5.2.tgz",
"integrity": "sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==",
"dependencies": {
"bytes": "3.1.2",
"http-errors": "2.0.0",
@ -21554,9 +21554,9 @@
}
},
"node_modules/webpack-dev-middleware": {
"version": "5.3.3",
"resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz",
"integrity": "sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==",
"version": "5.3.4",
"resolved": "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz",
"integrity": "sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==",
"dependencies": {
"colorette": "^2.0.10",
"memfs": "^3.4.3",

View file

@ -28,8 +28,9 @@ const sidebars = {
slug: "/simple_proxy",
},
items: [
"proxy/quick_start",
"proxy/deploy",
"proxy/quick_start",
"proxy/deploy",
"proxy/prod",
"proxy/configs",
{
type: "link",
@ -42,7 +43,6 @@ const sidebars = {
"proxy/users",
"proxy/team_based_routing",
"proxy/ui",
"proxy/budget_alerts",
"proxy/cost_tracking",
"proxy/token_auth",
{
@ -61,6 +61,7 @@ const sidebars = {
label: "Logging, Alerting",
items: ["proxy/logging", "proxy/alerting", "proxy/streaming_logging"],
},
"proxy/grafana_metrics",
"proxy/call_hooks",
"proxy/rules",
"proxy/cli",
@ -180,8 +181,9 @@ const sidebars = {
type: "category",
label: "Tutorials",
items: [
"tutorials/azure_openai",
"tutorials/oobabooga",
'tutorials/azure_openai',
'tutorials/instructor',
'tutorials/oobabooga',
"tutorials/gradio_integration",
"tutorials/huggingface_codellama",
"tutorials/huggingface_tutorial",

View file

@ -3138,13 +3138,13 @@ bluebird@~3.4.1:
resolved "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz"
integrity sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==
body-parser@1.20.1:
version "1.20.1"
resolved "https://registry.npmjs.org/body-parser/-/body-parser-1.20.1.tgz"
integrity sha512-jWi7abTbYwajOytWCQc37VulmWiRae5RyTpaCyDcS5/lMdtwSz5lOpDE67srw/HYe35f1z3fDQw+3txg7gNtWw==
body-parser@1.20.2:
version "1.20.2"
resolved "https://registry.yarnpkg.com/body-parser/-/body-parser-1.20.2.tgz#6feb0e21c4724d06de7ff38da36dad4f57a747fd"
integrity sha512-ml9pReCu3M61kGlqoTm2umSXTlRTuGTx0bfYj+uIUKKYycG5NtSbeetV3faSU6R7ajOPw0g/J1PvK4qNy7s5bA==
dependencies:
bytes "3.1.2"
content-type "~1.0.4"
content-type "~1.0.5"
debug "2.6.9"
depd "2.0.0"
destroy "1.2.0"
@ -3152,7 +3152,7 @@ body-parser@1.20.1:
iconv-lite "0.4.24"
on-finished "2.4.1"
qs "6.11.0"
raw-body "2.5.1"
raw-body "2.5.2"
type-is "~1.6.18"
unpipe "1.0.0"
@ -3921,7 +3921,7 @@ content-disposition@0.5.4:
dependencies:
safe-buffer "5.2.1"
content-type@~1.0.4:
content-type@~1.0.4, content-type@~1.0.5:
version "1.0.5"
resolved "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz"
integrity sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==
@ -3941,10 +3941,10 @@ cookie-signature@1.0.6:
resolved "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.0.6.tgz"
integrity sha512-QADzlaHc8icV8I7vbaJXJwod9HWYp8uCqf1xa4OfNu1T7JVxQIrUgOWtHdNDtPiywmFbiS12VjotIXLrKM3orQ==
cookie@0.5.0:
version "0.5.0"
resolved "https://registry.npmjs.org/cookie/-/cookie-0.5.0.tgz"
integrity sha512-YZ3GUyn/o8gfKJlnlX7g7xq4gyO6OSuhGPKaaGssGB2qgDUS0gPgtTvoyZLTt9Ab6dC4hfc9dV5arkvc/OCmrw==
cookie@0.6.0:
version "0.6.0"
resolved "https://registry.yarnpkg.com/cookie/-/cookie-0.6.0.tgz#2798b04b071b0ecbff0dbb62a505a8efa4e19051"
integrity sha512-U71cyTamuh1CRNCfpGY6to28lxvNwPG4Guz/EVjgf3Jmzv0vlDp1atT9eS5dDjMYHucpHbWns6Lwf3BKz6svdw==
copy-descriptor@^0.1.0:
version "0.1.1"
@ -5325,16 +5325,16 @@ expand-template@^2.0.3:
integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==
express@^4.17.1, express@^4.17.3:
version "4.18.2"
resolved "https://registry.npmjs.org/express/-/express-4.18.2.tgz"
integrity sha512-5/PsL6iGPdfQ/lKM1UuielYgv3BUoJfz1aUwU9vHZ+J7gyvwdQXFEBIEIaxeGf0GIcreATNyBExtalisDbuMqQ==
version "4.19.2"
resolved "https://registry.yarnpkg.com/express/-/express-4.19.2.tgz#e25437827a3aa7f2a827bc8171bbbb664a356465"
integrity sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==
dependencies:
accepts "~1.3.8"
array-flatten "1.1.1"
body-parser "1.20.1"
body-parser "1.20.2"
content-disposition "0.5.4"
content-type "~1.0.4"
cookie "0.5.0"
cookie "0.6.0"
cookie-signature "1.0.6"
debug "2.6.9"
depd "2.0.0"
@ -9924,10 +9924,10 @@ range-parser@^1.2.1, range-parser@~1.2.1:
resolved "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz"
integrity sha512-Hrgsx+orqoygnmhFbKaHE6c296J+HTAQXoxEF6gNupROmmGJRoyzfG3ccAveqCBrwr/2yxQ5BVd/GTl5agOwSg==
raw-body@2.5.1:
version "2.5.1"
resolved "https://registry.npmjs.org/raw-body/-/raw-body-2.5.1.tgz"
integrity sha512-qqJBtEyVgS0ZmPGdCFPWJ3FreoqvG4MVQln/kCgF7Olq95IbOp0/BWyMwbdtn4VTvkM8Y7khCQ2Xgk/tcrCXig==
raw-body@2.5.2:
version "2.5.2"
resolved "https://registry.yarnpkg.com/raw-body/-/raw-body-2.5.2.tgz#99febd83b90e08975087e8f1f9419a149366b68a"
integrity sha512-8zGqypfENjCIqGhgXToC8aB2r7YrBX+AQAfIPs/Mlk+BtPTztOvTS01NRW/3Eh60J+a48lt8qsCzirQ6loCVfA==
dependencies:
bytes "3.1.2"
http-errors "2.0.0"
@ -12406,9 +12406,9 @@ webpack-bundle-analyzer@^4.5.0:
ws "^7.3.1"
webpack-dev-middleware@^5.3.1:
version "5.3.3"
resolved "https://registry.npmjs.org/webpack-dev-middleware/-/webpack-dev-middleware-5.3.3.tgz"
integrity sha512-hj5CYrY0bZLB+eTO+x/j67Pkrquiy7kWepMHmUMoPsmcUaeEnQJqFzHJOyxgWlq746/wUuA64p9ta34Kyb01pA==
version "5.3.4"
resolved "https://registry.yarnpkg.com/webpack-dev-middleware/-/webpack-dev-middleware-5.3.4.tgz#eb7b39281cbce10e104eb2b8bf2b63fce49a3517"
integrity sha512-BVdTqhhs+0IfoeAf7EoH5WE+exCmqGerHfDM0IL096Px60Tq2Mn9MAbnaGUe6HiMa41KMCYF19gyzZmBcq/o4Q==
dependencies:
colorette "^2.0.10"
memfs "^3.4.3"

View file

@ -96,6 +96,8 @@ class _ENTERPRISE_GoogleTextModeration(CustomLogger):
async def async_moderation_hook(
self,
data: dict,
user_api_key_dict: UserAPIKeyAuth,
call_type: Literal["completion", "embeddings", "image_generation"],
):
"""
- Calls Google's Text Moderation API

View file

@ -99,6 +99,8 @@ class _ENTERPRISE_LlamaGuard(CustomLogger):
async def async_moderation_hook(
self,
data: dict,
user_api_key_dict: UserAPIKeyAuth,
call_type: Literal["completion", "embeddings", "image_generation"],
):
"""
- Calls the Llama Guard Endpoint

View file

@ -22,6 +22,7 @@ from litellm.utils import (
)
from datetime import datetime
import aiohttp, asyncio
from litellm.utils import get_formatted_prompt
litellm.set_verbose = True
@ -29,9 +30,12 @@ litellm.set_verbose = True
class _ENTERPRISE_LLMGuard(CustomLogger):
# Class variables or attributes
def __init__(
self, mock_testing: bool = False, mock_redacted_text: Optional[dict] = None
self,
mock_testing: bool = False,
mock_redacted_text: Optional[dict] = None,
):
self.mock_redacted_text = mock_redacted_text
self.llm_guard_mode = litellm.llm_guard_mode
if mock_testing == True: # for testing purposes only
return
self.llm_guard_api_base = litellm.get_secret("LLM_GUARD_API_BASE", None)
@ -59,7 +63,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
else:
# Make the first request to /analyze
analyze_url = f"{self.llm_guard_api_base}analyze/prompt"
verbose_proxy_logger.debug(f"Making request to: {analyze_url}")
verbose_proxy_logger.debug("Making request to: %s", analyze_url)
analyze_payload = {"prompt": text}
redacted_text = None
async with session.post(
@ -72,7 +76,7 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
if redacted_text is not None:
if (
redacted_text.get("is_valid", None) is not None
and redacted_text["is_valid"] == "True"
and redacted_text["is_valid"] != True
):
raise HTTPException(
status_code=400,
@ -91,9 +95,26 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
traceback.print_exc()
raise e
def should_proceed(self, user_api_key_dict: UserAPIKeyAuth) -> bool:
if self.llm_guard_mode == "key-specific":
# check if llm guard enabled for specific keys only
self.print_verbose(
f"user_api_key_dict.permissions: {user_api_key_dict.permissions}"
)
if (
user_api_key_dict.permissions.get("enable_llm_guard_check", False)
== True
):
return True
elif self.llm_guard_mode == "all":
return True
return False
async def async_moderation_hook(
self,
data: dict,
user_api_key_dict: UserAPIKeyAuth,
call_type: Literal["completion", "embeddings", "image_generation"],
):
"""
- Calls the LLM Guard Endpoint
@ -101,7 +122,32 @@ class _ENTERPRISE_LLMGuard(CustomLogger):
- Use the sanitized prompt returned
- LLM Guard can handle things like PII Masking, etc.
"""
return data
self.print_verbose(
f"Inside LLM Guard Pre-Call Hook - llm_guard_mode={self.llm_guard_mode}"
)
_proceed = self.should_proceed(user_api_key_dict=user_api_key_dict)
if _proceed == False:
return
self.print_verbose("Makes LLM Guard Check")
try:
assert call_type in [
"completion",
"embeddings",
"image_generation",
"moderation",
"audio_transcription",
]
except Exception as e:
self.print_verbose(
f"Call Type - {call_type}, not in accepted list - ['completion','embeddings','image_generation','moderation','audio_transcription']"
)
return data
formatted_prompt = get_formatted_prompt(data=data, call_type=call_type) # type: ignore
self.print_verbose(f"LLM Guard, formatted_prompt: {formatted_prompt}")
return await self.moderation_check(text=formatted_prompt)
async def async_post_call_streaming_hook(
self, user_api_key_dict: UserAPIKeyAuth, response: str

View file

@ -0,0 +1,8 @@
```
npm install
npm run dev
```
```
npm run deploy
```

View file

@ -0,0 +1,14 @@
{
"scripts": {
"dev": "wrangler dev src/index.ts",
"deploy": "wrangler deploy --minify src/index.ts"
},
"dependencies": {
"hono": "^4.1.4",
"openai": "^4.29.2"
},
"devDependencies": {
"@cloudflare/workers-types": "^4.20240208.0",
"wrangler": "^3.32.0"
}
}

View file

@ -0,0 +1,59 @@
import { Hono } from 'hono'
import { Context } from 'hono';
import { bearerAuth } from 'hono/bearer-auth'
import OpenAI from "openai";
const openai = new OpenAI({
apiKey: "sk-1234",
baseURL: "https://openai-endpoint.ishaanjaffer0324.workers.dev"
});
async function call_proxy() {
const completion = await openai.chat.completions.create({
messages: [{ role: "system", content: "You are a helpful assistant." }],
model: "gpt-3.5-turbo",
});
return completion
}
const app = new Hono()
// Middleware for API Key Authentication
const apiKeyAuth = async (c: Context, next: Function) => {
const apiKey = c.req.header('Authorization');
if (!apiKey || apiKey !== 'Bearer sk-1234') {
return c.text('Unauthorized', 401);
}
await next();
};
app.use('/*', apiKeyAuth)
app.get('/', (c) => {
return c.text('Hello Hono!')
})
// Handler for chat completions
const chatCompletionHandler = async (c: Context) => {
// Assuming your logic for handling chat completion goes here
// For demonstration, just returning a simple JSON response
const response = await call_proxy()
return c.json(response);
};
// Register the above handler for different POST routes with the apiKeyAuth middleware
app.post('/v1/chat/completions', chatCompletionHandler);
app.post('/chat/completions', chatCompletionHandler);
// Example showing how you might handle dynamic segments within the URL
// Here, using ':model*' to capture the rest of the path as a parameter 'model'
app.post('/openai/deployments/:model*/chat/completions', chatCompletionHandler);
export default app

View file

@ -0,0 +1,16 @@
{
"compilerOptions": {
"target": "ESNext",
"module": "ESNext",
"moduleResolution": "Bundler",
"strict": true,
"lib": [
"ESNext"
],
"types": [
"@cloudflare/workers-types"
],
"jsx": "react-jsx",
"jsxImportSource": "hono/jsx"
},
}

View file

@ -0,0 +1,18 @@
name = "my-app"
compatibility_date = "2023-12-01"
# [vars]
# MY_VAR = "my-variable"
# [[kv_namespaces]]
# binding = "MY_KV_NAMESPACE"
# id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
# [[r2_buckets]]
# binding = "MY_BUCKET"
# bucket_name = "my-bucket"
# [[d1_databases]]
# binding = "DB"
# database_name = "my-database"
# database_id = ""

View file

@ -0,0 +1,26 @@
# Use the specific Node.js v20.11.0 image
FROM node:20.11.0
# Set the working directory inside the container
WORKDIR /app
# Copy package.json and package-lock.json to the working directory
COPY ./litellm-js/spend-logs/package*.json ./
# Install dependencies
RUN npm install
# Install Prisma globally
RUN npm install -g prisma
# Copy the rest of the application code
COPY ./litellm-js/spend-logs .
# Generate Prisma client
RUN npx prisma generate
# Expose the port that the Node.js server will run on
EXPOSE 3000
# Command to run the Node.js app with npm run dev
CMD ["npm", "run", "dev"]

View file

@ -0,0 +1,8 @@
```
npm install
npm run dev
```
```
open http://localhost:3000
```

508
litellm-js/spend-logs/package-lock.json generated Normal file
View file

@ -0,0 +1,508 @@
{
"name": "spend-logs",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"dependencies": {
"@hono/node-server": "^1.9.0",
"hono": "^4.1.5"
},
"devDependencies": {
"@types/node": "^20.11.17",
"tsx": "^4.7.1"
}
},
"node_modules/@esbuild/aix-ppc64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.19.12.tgz",
"integrity": "sha512-bmoCYyWdEL3wDQIVbcyzRyeKLgk2WtWLTWz1ZIAZF/EGbNOwSA6ew3PftJ1PqMiOOGu0OyFMzG53L0zqIpPeNA==",
"cpu": [
"ppc64"
],
"dev": true,
"optional": true,
"os": [
"aix"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/android-arm": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.19.12.tgz",
"integrity": "sha512-qg/Lj1mu3CdQlDEEiWrlC4eaPZ1KztwGJ9B6J+/6G+/4ewxJg7gqj8eVYWvao1bXrqGiW2rsBZFSX3q2lcW05w==",
"cpu": [
"arm"
],
"dev": true,
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/android-arm64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.19.12.tgz",
"integrity": "sha512-P0UVNGIienjZv3f5zq0DP3Nt2IE/3plFzuaS96vihvD0Hd6H/q4WXUGpCxD/E8YrSXfNyRPbpTq+T8ZQioSuPA==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/android-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.19.12.tgz",
"integrity": "sha512-3k7ZoUW6Q6YqhdhIaq/WZ7HwBpnFBlW905Fa4s4qWJyiNOgT1dOqDiVAQFwBH7gBRZr17gLrlFCRzF6jFh7Kew==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"android"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/darwin-arm64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.19.12.tgz",
"integrity": "sha512-B6IeSgZgtEzGC42jsI+YYu9Z3HKRxp8ZT3cqhvliEHovq8HSX2YX8lNocDn79gCKJXOSaEot9MVYky7AKjCs8g==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/darwin-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.19.12.tgz",
"integrity": "sha512-hKoVkKzFiToTgn+41qGhsUJXFlIjxI/jSYeZf3ugemDYZldIXIxhvwN6erJGlX4t5h417iFuheZ7l+YVn05N3A==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/freebsd-arm64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.19.12.tgz",
"integrity": "sha512-4aRvFIXmwAcDBw9AueDQ2YnGmz5L6obe5kmPT8Vd+/+x/JMVKCgdcRwH6APrbpNXsPz+K653Qg8HB/oXvXVukA==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"freebsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/freebsd-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.19.12.tgz",
"integrity": "sha512-EYoXZ4d8xtBoVN7CEwWY2IN4ho76xjYXqSXMNccFSx2lgqOG/1TBPW0yPx1bJZk94qu3tX0fycJeeQsKovA8gg==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"freebsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-arm": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.19.12.tgz",
"integrity": "sha512-J5jPms//KhSNv+LO1S1TX1UWp1ucM6N6XuL6ITdKWElCu8wXP72l9MM0zDTzzeikVyqFE6U8YAV9/tFyj0ti+w==",
"cpu": [
"arm"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-arm64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.19.12.tgz",
"integrity": "sha512-EoTjyYyLuVPfdPLsGVVVC8a0p1BFFvtpQDB/YLEhaXyf/5bczaGeN15QkR+O4S5LeJ92Tqotve7i1jn35qwvdA==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-ia32": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.19.12.tgz",
"integrity": "sha512-Thsa42rrP1+UIGaWz47uydHSBOgTUnwBwNq59khgIwktK6x60Hivfbux9iNR0eHCHzOLjLMLfUMLCypBkZXMHA==",
"cpu": [
"ia32"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-loong64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.19.12.tgz",
"integrity": "sha512-LiXdXA0s3IqRRjm6rV6XaWATScKAXjI4R4LoDlvO7+yQqFdlr1Bax62sRwkVvRIrwXxvtYEHHI4dm50jAXkuAA==",
"cpu": [
"loong64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-mips64el": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.19.12.tgz",
"integrity": "sha512-fEnAuj5VGTanfJ07ff0gOA6IPsvrVHLVb6Lyd1g2/ed67oU1eFzL0r9WL7ZzscD+/N6i3dWumGE1Un4f7Amf+w==",
"cpu": [
"mips64el"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-ppc64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.19.12.tgz",
"integrity": "sha512-nYJA2/QPimDQOh1rKWedNOe3Gfc8PabU7HT3iXWtNUbRzXS9+vgB0Fjaqr//XNbd82mCxHzik2qotuI89cfixg==",
"cpu": [
"ppc64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-riscv64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.19.12.tgz",
"integrity": "sha512-2MueBrlPQCw5dVJJpQdUYgeqIzDQgw3QtiAHUC4RBz9FXPrskyyU3VI1hw7C0BSKB9OduwSJ79FTCqtGMWqJHg==",
"cpu": [
"riscv64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-s390x": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.19.12.tgz",
"integrity": "sha512-+Pil1Nv3Umes4m3AZKqA2anfhJiVmNCYkPchwFJNEJN5QxmTs1uzyy4TvmDrCRNT2ApwSari7ZIgrPeUx4UZDg==",
"cpu": [
"s390x"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/linux-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.19.12.tgz",
"integrity": "sha512-B71g1QpxfwBvNrfyJdVDexenDIt1CiDN1TIXLbhOw0KhJzE78KIFGX6OJ9MrtC0oOqMWf+0xop4qEU8JrJTwCg==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"linux"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/netbsd-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.19.12.tgz",
"integrity": "sha512-3ltjQ7n1owJgFbuC61Oj++XhtzmymoCihNFgT84UAmJnxJfm4sYCiSLTXZtE00VWYpPMYc+ZQmB6xbSdVh0JWA==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"netbsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/openbsd-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.19.12.tgz",
"integrity": "sha512-RbrfTB9SWsr0kWmb9srfF+L933uMDdu9BIzdA7os2t0TXhCRjrQyCeOt6wVxr79CKD4c+p+YhCj31HBkYcXebw==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"openbsd"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/sunos-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.19.12.tgz",
"integrity": "sha512-HKjJwRrW8uWtCQnQOz9qcU3mUZhTUQvi56Q8DPTLLB+DawoiQdjsYq+j+D3s9I8VFtDr+F9CjgXKKC4ss89IeA==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"sunos"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/win32-arm64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.19.12.tgz",
"integrity": "sha512-URgtR1dJnmGvX864pn1B2YUYNzjmXkuJOIqG2HdU62MVS4EHpU2946OZoTMnRUHklGtJdJZ33QfzdjGACXhn1A==",
"cpu": [
"arm64"
],
"dev": true,
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/win32-ia32": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.19.12.tgz",
"integrity": "sha512-+ZOE6pUkMOJfmxmBZElNOx72NKpIa/HFOMGzu8fqzQJ5kgf6aTGrcJaFsNiVMH4JKpMipyK+7k0n2UXN7a8YKQ==",
"cpu": [
"ia32"
],
"dev": true,
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@esbuild/win32-x64": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.19.12.tgz",
"integrity": "sha512-T1QyPSDCyMXaO3pzBkF96E8xMkiRYbUEZADd29SyPGabqxMViNoii+NcK7eWJAEoU6RZyEm5lVSIjTmcdoB9HA==",
"cpu": [
"x64"
],
"dev": true,
"optional": true,
"os": [
"win32"
],
"engines": {
"node": ">=12"
}
},
"node_modules/@hono/node-server": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.9.0.tgz",
"integrity": "sha512-oJjk7WXBlENeHhWiMqSyxPIZ3Kmf5ZYxqdlcSIXyN8Rn50bNJsPl99G4POBS03Jxh56FdfRJ0SEnC8mAVIiavQ==",
"engines": {
"node": ">=18.14.1"
}
},
"node_modules/@types/node": {
"version": "20.11.30",
"resolved": "https://registry.npmjs.org/@types/node/-/node-20.11.30.tgz",
"integrity": "sha512-dHM6ZxwlmuZaRmUPfv1p+KrdD1Dci04FbdEm/9wEMouFqxYoFl5aMkt0VMAUtYRQDyYvD41WJLukhq/ha3YuTw==",
"dev": true,
"dependencies": {
"undici-types": "~5.26.4"
}
},
"node_modules/esbuild": {
"version": "0.19.12",
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.19.12.tgz",
"integrity": "sha512-aARqgq8roFBj054KvQr5f1sFu0D65G+miZRCuJyJ0G13Zwx7vRar5Zhn2tkQNzIXcBrNVsv/8stehpj+GAjgbg==",
"dev": true,
"hasInstallScript": true,
"bin": {
"esbuild": "bin/esbuild"
},
"engines": {
"node": ">=12"
},
"optionalDependencies": {
"@esbuild/aix-ppc64": "0.19.12",
"@esbuild/android-arm": "0.19.12",
"@esbuild/android-arm64": "0.19.12",
"@esbuild/android-x64": "0.19.12",
"@esbuild/darwin-arm64": "0.19.12",
"@esbuild/darwin-x64": "0.19.12",
"@esbuild/freebsd-arm64": "0.19.12",
"@esbuild/freebsd-x64": "0.19.12",
"@esbuild/linux-arm": "0.19.12",
"@esbuild/linux-arm64": "0.19.12",
"@esbuild/linux-ia32": "0.19.12",
"@esbuild/linux-loong64": "0.19.12",
"@esbuild/linux-mips64el": "0.19.12",
"@esbuild/linux-ppc64": "0.19.12",
"@esbuild/linux-riscv64": "0.19.12",
"@esbuild/linux-s390x": "0.19.12",
"@esbuild/linux-x64": "0.19.12",
"@esbuild/netbsd-x64": "0.19.12",
"@esbuild/openbsd-x64": "0.19.12",
"@esbuild/sunos-x64": "0.19.12",
"@esbuild/win32-arm64": "0.19.12",
"@esbuild/win32-ia32": "0.19.12",
"@esbuild/win32-x64": "0.19.12"
}
},
"node_modules/fsevents": {
"version": "2.3.3",
"resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
"integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
"dev": true,
"hasInstallScript": true,
"optional": true,
"os": [
"darwin"
],
"engines": {
"node": "^8.16.0 || ^10.6.0 || >=11.0.0"
}
},
"node_modules/get-tsconfig": {
"version": "4.7.3",
"resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.7.3.tgz",
"integrity": "sha512-ZvkrzoUA0PQZM6fy6+/Hce561s+faD1rsNwhnO5FelNjyy7EMGJ3Rz1AQ8GYDWjhRs/7dBLOEJvhK8MiEJOAFg==",
"dev": true,
"dependencies": {
"resolve-pkg-maps": "^1.0.0"
},
"funding": {
"url": "https://github.com/privatenumber/get-tsconfig?sponsor=1"
}
},
"node_modules/hono": {
"version": "4.1.5",
"resolved": "https://registry.npmjs.org/hono/-/hono-4.1.5.tgz",
"integrity": "sha512-3ChJiIoeCxvkt6vnkxJagplrt1YZg3NyNob7ssVeK2PUqEINp4q1F94HzFnvY9QE8asVmbW5kkTDlyWylfg2vg==",
"engines": {
"node": ">=16.0.0"
}
},
"node_modules/resolve-pkg-maps": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz",
"integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==",
"dev": true,
"funding": {
"url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1"
}
},
"node_modules/tsx": {
"version": "4.7.1",
"resolved": "https://registry.npmjs.org/tsx/-/tsx-4.7.1.tgz",
"integrity": "sha512-8d6VuibXHtlN5E3zFkgY8u4DX7Y3Z27zvvPKVmLon/D4AjuKzarkUBTLDBgj9iTQ0hg5xM7c/mYiRVM+HETf0g==",
"dev": true,
"dependencies": {
"esbuild": "~0.19.10",
"get-tsconfig": "^4.7.2"
},
"bin": {
"tsx": "dist/cli.mjs"
},
"engines": {
"node": ">=18.0.0"
},
"optionalDependencies": {
"fsevents": "~2.3.3"
}
},
"node_modules/undici-types": {
"version": "5.26.5",
"resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz",
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==",
"dev": true
}
}
}

View file

@ -0,0 +1,13 @@
{
"scripts": {
"dev": "tsx watch src/index.ts"
},
"dependencies": {
"@hono/node-server": "^1.9.0",
"hono": "^4.1.5"
},
"devDependencies": {
"@types/node": "^20.11.17",
"tsx": "^4.7.1"
}
}

View file

@ -0,0 +1,29 @@
generator client {
provider = "prisma-client-js"
}
datasource client {
provider = "postgresql"
url = env("DATABASE_URL")
}
model LiteLLM_SpendLogs {
request_id String @id
call_type String
api_key String @default("")
spend Float @default(0.0)
total_tokens Int @default(0)
prompt_tokens Int @default(0)
completion_tokens Int @default(0)
startTime DateTime
endTime DateTime
model String @default("")
api_base String @default("")
user String @default("")
metadata Json @default("{}")
cache_hit String @default("")
cache_key String @default("")
request_tags Json @default("[]")
team_id String?
end_user String?
}

View file

@ -0,0 +1,32 @@
export type LiteLLM_IncrementSpend = {
key_transactions: Array<LiteLLM_IncrementObject>, // [{"key": spend},..]
user_transactions: Array<LiteLLM_IncrementObject>,
team_transactions: Array<LiteLLM_IncrementObject>,
spend_logs_transactions: Array<LiteLLM_SpendLogs>
}
export type LiteLLM_IncrementObject = {
key: string,
spend: number
}
export type LiteLLM_SpendLogs = {
request_id: string; // @id means it's a unique identifier
call_type: string;
api_key: string; // @default("") means it defaults to an empty string if not provided
spend: number; // Float in Prisma corresponds to number in TypeScript
total_tokens: number; // Int in Prisma corresponds to number in TypeScript
prompt_tokens: number;
completion_tokens: number;
startTime: Date; // DateTime in Prisma corresponds to Date in TypeScript
endTime: Date;
model: string; // @default("") means it defaults to an empty string if not provided
api_base: string;
user: string;
metadata: any; // Json type in Prisma is represented by any in TypeScript; could also use a more specific type if the structure of JSON is known
cache_hit: string;
cache_key: string;
request_tags: any; // Similarly, this could be an array or a more specific type depending on the expected structure
team_id?: string | null; // ? indicates it's optional and can be undefined, but could also be null if not provided
end_user?: string | null;
};

View file

@ -0,0 +1,84 @@
import { serve } from '@hono/node-server'
import { Hono } from 'hono'
import { PrismaClient } from '@prisma/client'
import {LiteLLM_SpendLogs, LiteLLM_IncrementSpend, LiteLLM_IncrementObject} from './_types'
const app = new Hono()
const prisma = new PrismaClient()
// In-memory storage for logs
let spend_logs: LiteLLM_SpendLogs[] = [];
const key_logs: LiteLLM_IncrementObject[] = [];
const user_logs: LiteLLM_IncrementObject[] = [];
const transaction_logs: LiteLLM_IncrementObject[] = [];
app.get('/', (c) => {
return c.text('Hello Hono!')
})
const MIN_LOGS = 1; // Minimum number of logs needed to initiate a flush
const FLUSH_INTERVAL = 5000; // Time in ms to wait before trying to flush again
const BATCH_SIZE = 100; // Preferred size of each batch to write to the database
const MAX_LOGS_PER_INTERVAL = 1000; // Maximum number of logs to flush in a single interval
const flushLogsToDb = async () => {
if (spend_logs.length >= MIN_LOGS) {
// Limit the logs to process in this interval to MAX_LOGS_PER_INTERVAL or less
const logsToProcess = spend_logs.slice(0, MAX_LOGS_PER_INTERVAL);
for (let i = 0; i < logsToProcess.length; i += BATCH_SIZE) {
// Create subarray for current batch, ensuring it doesn't exceed the BATCH_SIZE
const batch = logsToProcess.slice(i, i + BATCH_SIZE);
// Convert datetime strings to Date objects
const batchWithDates = batch.map(entry => ({
...entry,
startTime: new Date(entry.startTime),
endTime: new Date(entry.endTime),
// Repeat for any other DateTime fields you may have
}));
await prisma.liteLLM_SpendLogs.createMany({
data: batchWithDates,
});
console.log(`Flushed ${batch.length} logs to the DB.`);
}
// Remove the processed logs from spend_logs
spend_logs = spend_logs.slice(logsToProcess.length);
console.log(`${logsToProcess.length} logs processed. Remaining in queue: ${spend_logs.length}`);
} else {
// This will ensure it doesn't falsely claim "No logs to flush." when it's merely below the MIN_LOGS threshold.
if(spend_logs.length > 0) {
console.log(`Accumulating logs. Currently at ${spend_logs.length}, waiting for at least ${MIN_LOGS}.`);
} else {
console.log("No logs to flush.");
}
}
};
// Setup interval for attempting to flush the logs
setInterval(flushLogsToDb, FLUSH_INTERVAL);
// Route to receive log messages
app.post('/spend/update', async (c) => {
const incomingLogs = await c.req.json<LiteLLM_SpendLogs[]>();
spend_logs.push(...incomingLogs);
console.log(`Received and stored ${incomingLogs.length} logs. Total logs in memory: ${spend_logs.length}`);
return c.json({ message: `Successfully stored ${incomingLogs.length} logs` });
});
const port = 3000
console.log(`Server is running on port ${port}`)
serve({
fetch: app.fetch,
port
})

View file

@ -0,0 +1,13 @@
{
"compilerOptions": {
"target": "ESNext",
"module": "ESNext",
"moduleResolution": "Bundler",
"strict": true,
"types": [
"node"
],
"jsx": "react-jsx",
"jsxImportSource": "hono/jsx",
}
}

View file

@ -1,6 +1,6 @@
### INIT VARIABLES ###
import threading, requests, os
from typing import Callable, List, Optional, Dict, Union, Any
from typing import Callable, List, Optional, Dict, Union, Any, Literal
from litellm.caching import Cache
from litellm._logging import set_verbose, _turn_on_debug, verbose_logger
from litellm.proxy._types import KeyManagementSystem, KeyManagementSettings
@ -56,6 +56,7 @@ baseten_key: Optional[str] = None
aleph_alpha_key: Optional[str] = None
nlp_cloud_key: Optional[str] = None
use_client: bool = False
disable_streaming_logging: bool = False
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
presidio_ad_hoc_recognizers: Optional[str] = None
@ -63,6 +64,7 @@ google_moderation_confidence_threshold: Optional[float] = None
llamaguard_unsafe_content_categories: Optional[str] = None
blocked_user_list: Optional[Union[str, List]] = None
banned_keywords_list: Optional[Union[str, List]] = None
llm_guard_mode: Literal["all", "key-specific"] = "all"
##################
logging: bool = True
caching: bool = (
@ -172,6 +174,7 @@ upperbound_key_generate_params: Optional[Dict] = None
default_user_params: Optional[Dict] = None
default_team_settings: Optional[List] = None
max_user_budget: Optional[float] = None
max_end_user_budget: Optional[float] = None
#### RELIABILITY ####
request_timeout: Optional[float] = 6000
num_retries: Optional[int] = None # per model endpoint

View file

@ -38,6 +38,9 @@ class BaseCache:
async def async_get_cache(self, key, **kwargs):
raise NotImplementedError
async def batch_cache_write(self, result, *args, **kwargs):
raise NotImplementedError
async def disconnect(self):
raise NotImplementedError
@ -96,7 +99,9 @@ class InMemoryCache(BaseCache):
class RedisCache(BaseCache):
# if users don't provider one, use the default litellm cache
def __init__(self, host=None, port=None, password=None, **kwargs):
def __init__(
self, host=None, port=None, password=None, redis_flush_size=100, **kwargs
):
from ._redis import get_redis_client, get_redis_connection_pool
redis_kwargs = {}
@ -111,6 +116,10 @@ class RedisCache(BaseCache):
self.redis_client = get_redis_client(**redis_kwargs)
self.redis_kwargs = redis_kwargs
self.async_redis_conn_pool = get_redis_connection_pool(**redis_kwargs)
# for high traffic, we store the redis results in memory and then batch write to redis
self.redis_batch_writing_buffer = []
self.redis_flush_size = redis_flush_size
self.redis_version = "Unknown"
try:
self.redis_version = self.redis_client.info()["redis_version"]
@ -161,8 +170,10 @@ class RedisCache(BaseCache):
)
except Exception as e:
# NON blocking - notify users Redis is throwing an exception
print_verbose(
f"LiteLLM Redis Caching: async set() - Got exception from REDIS : {str(e)}"
verbose_logger.error(
"LiteLLM Redis Caching: async set() - Got exception from REDIS %s, Writing value=%s",
str(e),
value,
)
traceback.print_exc()
@ -191,7 +202,27 @@ class RedisCache(BaseCache):
# Optionally, you could process 'results' to make sure that all set operations were successful.
return results
except Exception as e:
print_verbose(f"Error occurred in pipeline write - {str(e)}")
verbose_logger.error(
"LiteLLM Redis Caching: async set_cache_pipeline() - Got exception from REDIS %s, Writing value=%s",
str(e),
cache_value,
)
traceback.print_exc()
async def batch_cache_write(self, key, value, **kwargs):
print_verbose(
f"in batch cache writing for redis buffer size={len(self.redis_batch_writing_buffer)}",
)
self.redis_batch_writing_buffer.append((key, value))
if len(self.redis_batch_writing_buffer) >= self.redis_flush_size:
await self.flush_cache_buffer()
async def flush_cache_buffer(self):
print_verbose(
f"flushing to redis....reached size of buffer {len(self.redis_batch_writing_buffer)}"
)
await self.async_set_cache_pipeline(self.redis_batch_writing_buffer)
self.redis_batch_writing_buffer = []
def _get_cache_logic(self, cached_response: Any):
"""
@ -287,6 +318,9 @@ class RedisCache(BaseCache):
def flush_cache(self):
self.redis_client.flushall()
def flushall(self):
self.redis_client.flushall()
async def disconnect(self):
await self.async_redis_conn_pool.disconnect(inuse_connections=True)
@ -874,6 +908,7 @@ class Cache:
port: Optional[str] = None,
password: Optional[str] = None,
namespace: Optional[str] = None,
ttl: Optional[float] = None,
similarity_threshold: Optional[float] = None,
supported_call_types: Optional[
List[
@ -908,6 +943,7 @@ class Cache:
s3_path: Optional[str] = None,
redis_semantic_cache_use_async=False,
redis_semantic_cache_embedding_model="text-embedding-ada-002",
redis_flush_size=None,
**kwargs,
):
"""
@ -930,7 +966,9 @@ class Cache:
None. Cache is set as a litellm param
"""
if type == "redis":
self.cache: BaseCache = RedisCache(host, port, password, **kwargs)
self.cache: BaseCache = RedisCache(
host, port, password, redis_flush_size, **kwargs
)
elif type == "redis-semantic":
self.cache = RedisSemanticCache(
host,
@ -967,6 +1005,8 @@ class Cache:
self.supported_call_types = supported_call_types # default to ["completion", "acompletion", "embedding", "aembedding"]
self.type = type
self.namespace = namespace
self.redis_flush_size = redis_flush_size
self.ttl = ttl
def get_cache_key(self, *args, **kwargs):
"""
@ -1206,6 +1246,9 @@ class Cache:
if isinstance(result, OpenAIObject):
result = result.model_dump_json()
## DEFAULT TTL ##
if self.ttl is not None:
kwargs["ttl"] = self.ttl
## Get Cache-Controls ##
if kwargs.get("cache", None) is not None and isinstance(
kwargs.get("cache"), dict
@ -1213,6 +1256,7 @@ class Cache:
for k, v in kwargs.get("cache").items():
if k == "ttl":
kwargs["ttl"] = v
cached_data = {"timestamp": time.time(), "response": result}
return cache_key, cached_data, kwargs
else:
@ -1246,10 +1290,14 @@ class Cache:
Async implementation of add_cache
"""
try:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
if self.type == "redis" and self.redis_flush_size is not None:
# high traffic - fill in results in memory and then flush
await self.batch_cache_write(result, *args, **kwargs)
else:
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
await self.cache.async_set_cache(cache_key, cached_data, **kwargs)
except Exception as e:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
@ -1287,6 +1335,12 @@ class Cache:
print_verbose(f"LiteLLM Cache: Excepton add_cache: {str(e)}")
traceback.print_exc()
async def batch_cache_write(self, result, *args, **kwargs):
cache_key, cached_data, kwargs = self._add_cache_logic(
result=result, *args, **kwargs
)
await self.cache.batch_cache_write(cache_key, cached_data, **kwargs)
async def ping(self):
if hasattr(self.cache, "ping"):
return await self.cache.ping()

View file

@ -10,7 +10,7 @@ class AthinaLogger:
"Content-Type": "application/json"
}
self.athina_logging_url = "https://log.athina.ai/api/v1/log/inference"
self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response"]
self.additional_keys = ["environment", "prompt_slug", "customer_id", "customer_user_id", "session_id", "external_reference_id", "context", "expected_response", "user_query"]
def log_event(self, kwargs, response_obj, start_time, end_time, print_verbose):
import requests
@ -32,8 +32,6 @@ class AthinaLogger:
if "messages" in kwargs:
data["prompt"] = kwargs.get("messages", None)
if kwargs.get("messages") and len(kwargs.get("messages")) > 0:
data["user_query"] = kwargs.get("messages")[0].get("content", None)
# Directly add tools or functions if present
optional_params = kwargs.get("optional_params", {})

View file

@ -72,7 +72,12 @@ class CustomLogger: # https://docs.litellm.ai/docs/observability/custom_callbac
):
pass
async def async_moderation_hook(self, data: dict):
async def async_moderation_hook(
self,
data: dict,
user_api_key_dict: UserAPIKeyAuth,
call_type: Literal["completion", "embeddings", "image_generation"],
):
pass
async def async_post_call_streaming_hook(

View file

@ -246,13 +246,13 @@ class LangFuseLogger:
metadata_tags = metadata.get("tags", [])
tags = metadata_tags
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
trace_name = metadata.get("trace_name", None)
if trace_name is None:
# just log `litellm-{call_type}` as the trace name
trace_name = f"litellm-{kwargs.get('call_type', 'completion')}"
trace_params = {
"name": generation_name,
"name": trace_name,
"input": input,
"user_id": metadata.get("trace_user_id", user_id),
"id": metadata.get("trace_id", None),
@ -311,6 +311,11 @@ class LangFuseLogger:
"completion_tokens": response_obj["usage"]["completion_tokens"],
"total_cost": cost if supports_costs else None,
}
generation_name = metadata.get("generation_name", None)
if generation_name is None:
# just log `litellm-{call_type}` as the generation name
generation_name = f"litellm-{kwargs.get('call_type', 'completion')}"
generation_params = {
"name": generation_name,
"id": metadata.get("generation_id", generation_id),

View file

@ -131,18 +131,24 @@ def completion(
)
else:
# Separate system prompt from rest of message
system_prompt_idx: Optional[int] = None
system_prompt_indices = []
system_prompt = ""
for idx, message in enumerate(messages):
if message["role"] == "system":
optional_params["system"] = message["content"]
system_prompt_idx = idx
break
if system_prompt_idx is not None:
messages.pop(system_prompt_idx)
system_prompt += message["content"]
system_prompt_indices.append(idx)
if len(system_prompt_indices) > 0:
for idx in reversed(system_prompt_indices):
messages.pop(idx)
if len(system_prompt) > 0:
optional_params["system"] = system_prompt
# Format rest of message according to anthropic guidelines
messages = prompt_factory(
model=model, messages=messages, custom_llm_provider="anthropic"
)
try:
messages = prompt_factory(
model=model, messages=messages, custom_llm_provider="anthropic"
)
except Exception as e:
raise AnthropicError(status_code=400, message=str(e))
## Load Config
config = litellm.AnthropicConfig.get_config()
@ -295,7 +301,7 @@ def completion(
)
streaming_choice.delta = delta_obj
streaming_model_response.choices = [streaming_choice]
completion_stream = model_response_iterator(
completion_stream = ModelResponseIterator(
model_response=streaming_model_response
)
print_verbose(
@ -324,8 +330,30 @@ def completion(
return model_response
def model_response_iterator(model_response):
yield model_response
class ModelResponseIterator:
def __init__(self, model_response):
self.model_response = model_response
self.is_done = False
# Sync iterator
def __iter__(self):
return self
def __next__(self):
if self.is_done:
raise StopIteration
self.is_done = True
return self.model_response
# Async iterator
def __aiter__(self):
return self
async def __anext__(self):
if self.is_done:
raise StopAsyncIteration
self.is_done = True
return self.model_response
def embedding():

View file

@ -11,6 +11,7 @@ from .prompt_templates.factory import (
construct_tool_use_system_prompt,
extract_between_tags,
parse_xml_params,
contains_tag,
)
import httpx
@ -78,11 +79,13 @@ class AmazonTitanConfig:
class AmazonAnthropicClaude3Config:
"""
Reference: https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
Reference:
https://us-west-2.console.aws.amazon.com/bedrock/home?region=us-west-2#/providers?model=claude
https://docs.anthropic.com/claude/docs/models-overview#model-comparison
Supported Params for the Amazon / Anthropic Claude 3 models:
- `max_tokens` Required (integer) max tokens,
- `max_tokens` Required (integer) max tokens. Default is 4096
- `anthropic_version` Required (string) version of anthropic for bedrock - e.g. "bedrock-2023-05-31"
- `system` Optional (string) the system prompt, conversion from openai format to this is handled in factory.py
- `temperature` Optional (float) The amount of randomness injected into the response
@ -91,7 +94,7 @@ class AmazonAnthropicClaude3Config:
- `stop_sequences` Optional (List[str]) Custom text sequences that cause the model to stop generating
"""
max_tokens: Optional[int] = litellm.max_tokens
max_tokens: Optional[int] = 4096 # Opus, Sonnet, and Haiku default
anthropic_version: Optional[str] = "bedrock-2023-05-31"
system: Optional[str] = None
temperature: Optional[float] = None
@ -128,7 +131,15 @@ class AmazonAnthropicClaude3Config:
}
def get_supported_openai_params(self):
return ["max_tokens", "tools", "tool_choice", "stream"]
return [
"max_tokens",
"tools",
"tool_choice",
"stream",
"stop",
"temperature",
"top_p",
]
def map_openai_params(self, non_default_params: dict, optional_params: dict):
for param, value in non_default_params.items():
@ -679,6 +690,7 @@ def completion(
timeout=None,
):
exception_mapping_worked = False
_is_function_call = False
try:
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
@ -727,8 +739,10 @@ def completion(
system_messages.append(message["content"])
system_prompt_idx.append(idx)
if len(system_prompt_idx) > 0:
inference_params["system"] = '\n'.join(system_messages)
messages = [i for j, i in enumerate(messages) if j not in system_prompt_idx]
inference_params["system"] = "\n".join(system_messages)
messages = [
i for j, i in enumerate(messages) if j not in system_prompt_idx
]
# Format rest of message according to anthropic guidelines
messages = prompt_factory(
model=model, messages=messages, custom_llm_provider="anthropic"
@ -742,6 +756,7 @@ def completion(
inference_params[k] = v
## Handle Tool Calling
if "tools" in inference_params:
_is_function_call = True
tool_calling_system_prompt = construct_tool_use_system_prompt(
tools=inference_params["tools"]
)
@ -823,7 +838,7 @@ def completion(
## COMPLETION CALL
accept = "application/json"
contentType = "application/json"
if stream == True:
if stream == True and _is_function_call == False:
if provider == "ai21":
## LOGGING
request_str = f"""
@ -918,7 +933,9 @@ def completion(
elif provider == "anthropic":
if model.startswith("anthropic.claude-3"):
outputText = response_body.get("content")[0].get("text", None)
if "<invoke>" in outputText: # OUTPUT PARSE FUNCTION CALL
if outputText is not None and contains_tag(
"invoke", outputText
): # OUTPUT PARSE FUNCTION CALL
function_name = extract_between_tags("tool_name", outputText)[0]
function_arguments_str = extract_between_tags("invoke", outputText)[
0
@ -941,6 +958,56 @@ def completion(
content=None,
)
model_response.choices[0].message = _message # type: ignore
if _is_function_call == True and stream is not None and stream == True:
print_verbose(
f"INSIDE BEDROCK STREAMING TOOL CALLING CONDITION BLOCK"
)
# return an iterator
streaming_model_response = ModelResponse(stream=True)
streaming_model_response.choices[0].finish_reason = (
model_response.choices[0].finish_reason
)
# streaming_model_response.choices = [litellm.utils.StreamingChoices()]
streaming_choice = litellm.utils.StreamingChoices()
streaming_choice.index = model_response.choices[0].index
_tool_calls = []
print_verbose(
f"type of model_response.choices[0]: {type(model_response.choices[0])}"
)
print_verbose(f"type of streaming_choice: {type(streaming_choice)}")
if isinstance(model_response.choices[0], litellm.Choices):
if getattr(
model_response.choices[0].message, "tool_calls", None
) is not None and isinstance(
model_response.choices[0].message.tool_calls, list
):
for tool_call in model_response.choices[
0
].message.tool_calls:
_tool_call = {**tool_call.dict(), "index": 0}
_tool_calls.append(_tool_call)
delta_obj = litellm.utils.Delta(
content=getattr(
model_response.choices[0].message, "content", None
),
role=model_response.choices[0].message.role,
tool_calls=_tool_calls,
)
streaming_choice.delta = delta_obj
streaming_model_response.choices = [streaming_choice]
completion_stream = model_response_iterator(
model_response=streaming_model_response
)
print_verbose(
f"Returns anthropic CustomStreamWrapper with 'cached_response' streaming object"
)
return litellm.CustomStreamWrapper(
completion_stream=completion_stream,
model=model,
custom_llm_provider="cached_response",
logging_obj=logging_obj,
)
model_response["finish_reason"] = response_body["stop_reason"]
_usage = litellm.Usage(
prompt_tokens=response_body["usage"]["input_tokens"],
@ -1029,6 +1096,10 @@ def completion(
raise BedrockError(status_code=500, message=traceback.format_exc())
async def model_response_iterator(model_response):
yield model_response
def _embedding_func_single(
model: str,
input: str,

View file

@ -0,0 +1,38 @@
from typing import Optional
import httpx
class HTTPHandler:
def __init__(self, concurrent_limit=1000):
# Create a client with a connection pool
self.client = httpx.AsyncClient(
limits=httpx.Limits(
max_connections=concurrent_limit,
max_keepalive_connections=concurrent_limit,
)
)
async def close(self):
# Close the client when you're done with it
await self.client.aclose()
async def get(
self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
):
response = await self.client.get(url, params=params, headers=headers)
return response
async def post(
self,
url: str,
data: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
):
try:
response = await self.client.post(
url, data=data, params=params, headers=headers
)
return response
except Exception as e:
raise e

View file

@ -118,7 +118,7 @@ def completion(
logger_fn=None,
):
try:
import google.generativeai as genai
import google.generativeai as genai # type: ignore
except:
raise Exception(
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
@ -308,7 +308,7 @@ async def async_completion(
messages,
encoding,
):
import google.generativeai as genai
import google.generativeai as genai # type: ignore
response = await _model.generate_content_async(
contents=prompt,

View file

@ -68,9 +68,9 @@ class OllamaConfig:
repeat_last_n: Optional[int] = None
repeat_penalty: Optional[float] = None
temperature: Optional[float] = None
stop: Optional[
list
] = None # stop is a list based on this - https://github.com/jmorganca/ollama/pull/442
stop: Optional[list] = (
None # stop is a list based on this - https://github.com/jmorganca/ollama/pull/442
)
tfs_z: Optional[float] = None
num_predict: Optional[int] = None
top_k: Optional[int] = None
@ -344,9 +344,9 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
async def ollama_aembeddings(
api_base="http://localhost:11434",
model="llama2",
prompt="Why is the sky blue?",
api_base: str,
model: str,
prompts: list,
optional_params=None,
logging_obj=None,
model_response=None,
@ -365,51 +365,56 @@ async def ollama_aembeddings(
): # completion(top_k=3) > cohere_config(top_k=3) <- allows for dynamic variables to be passed in
optional_params[k] = v
data = {
"model": model,
"prompt": prompt,
}
## LOGGING
logging_obj.pre_call(
input=None,
api_key=None,
additional_args={"api_base": url, "complete_input_dict": data, "headers": {}},
)
total_input_tokens = 0
output_data = []
timeout = aiohttp.ClientTimeout(total=litellm.request_timeout) # 10 minutes
async with aiohttp.ClientSession(timeout=timeout) as session:
response = await session.post(url, json=data)
if response.status != 200:
text = await response.text()
raise OllamaError(status_code=response.status, message=text)
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response.text,
additional_args={
"headers": None,
"api_base": api_base,
},
)
response_json = await response.json()
embeddings = response_json["embedding"]
## RESPONSE OBJECT
output_data = []
for idx, embedding in enumerate(embeddings):
output_data.append(
{"object": "embedding", "index": idx, "embedding": embedding}
for idx, prompt in enumerate(prompts):
data = {
"model": model,
"prompt": prompt,
}
## LOGGING
logging_obj.pre_call(
input=None,
api_key=None,
additional_args={
"api_base": url,
"complete_input_dict": data,
"headers": {},
},
)
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
input_tokens = len(encoding.encode(prompt))
response = await session.post(url, json=data)
if response.status != 200:
text = await response.text()
raise OllamaError(status_code=response.status, message=text)
model_response["usage"] = {
"prompt_tokens": input_tokens,
"total_tokens": input_tokens,
}
return model_response
## LOGGING
logging_obj.post_call(
input=prompt,
api_key="",
original_response=response.text,
additional_args={
"headers": None,
"api_base": api_base,
},
)
response_json = await response.json()
embeddings: list[float] = response_json["embedding"]
output_data.append(
{"object": "embedding", "index": idx, "embedding": embeddings}
)
input_tokens = len(encoding.encode(prompt))
total_input_tokens += input_tokens
model_response["object"] = "list"
model_response["data"] = output_data
model_response["model"] = model
model_response["usage"] = {
"prompt_tokens": total_input_tokens,
"total_tokens": total_input_tokens,
}
return model_response

View file

@ -173,10 +173,11 @@ class OllamaChatConfig:
litellm.add_function_to_prompt = (
True # so that main.py adds the function call to the prompt
)
optional_params["functions_unsupported_model"] = non_default_params.pop(
optional_params["functions_unsupported_model"] = non_default_params.get(
"functions"
)
non_default_params.pop("tool_choice", None) # causes ollama requests to hang
non_default_params.pop("functions", None) # causes ollama requests to hang
return optional_params

View file

@ -98,7 +98,7 @@ def completion(
logger_fn=None,
):
try:
import google.generativeai as palm
import google.generativeai as palm # type: ignore
except:
raise Exception(
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"

View file

@ -5,12 +5,17 @@ from jinja2 import Template, exceptions, Environment, meta
from typing import Optional, Any
import imghdr, base64
from typing import List
import litellm
def default_pt(messages):
return " ".join(message["content"] for message in messages)
def prompt_injection_detection_default_pt():
return """Detect if a prompt is safe to run. Return 'UNSAFE' if not."""
# alpaca prompt template - for models like mythomax, etc.
def alpaca_pt(messages):
prompt = custom_prompt(
@ -638,11 +643,12 @@ def anthropic_messages_pt(messages: list):
"""
# add role=tool support to allow function call result/error submission
user_message_types = {"user", "tool"}
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, add a blank 'user' or 'assistant' message to ensure compatibility
# reformat messages to ensure user/assistant are alternating, if there's either 2 consecutive 'user' messages or 2 consecutive 'assistant' message, merge them.
new_messages = []
msg_i = 0
while msg_i < len(messages):
user_content = []
## MERGE CONSECUTIVE USER CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] in user_message_types:
if isinstance(messages[msg_i]["content"], list):
for m in messages[msg_i]["content"]:
@ -676,6 +682,7 @@ def anthropic_messages_pt(messages: list):
new_messages.append({"role": "user", "content": user_content})
assistant_content = []
## MERGE CONSECUTIVE ASSISTANT CONTENT ##
while msg_i < len(messages) and messages[msg_i]["role"] == "assistant":
assistant_text = (
messages[msg_i].get("content") or ""
@ -694,9 +701,14 @@ def anthropic_messages_pt(messages: list):
new_messages.append({"role": "assistant", "content": assistant_content})
if new_messages[0]["role"] != "user":
new_messages.insert(
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
)
if litellm.modify_params:
new_messages.insert(
0, {"role": "user", "content": [{"type": "text", "text": "."}]}
)
else:
raise Exception(
"Invalid first message. Should always start with 'role'='user' for Anthropic. System prompt is sent separately for Anthropic. set 'litellm.modify_params = True' or 'litellm_settings:modify_params = True' on proxy, to insert a placeholder user message - '.' as the first message, "
)
if new_messages[-1]["role"] == "assistant":
for content in new_messages[-1]["content"]:
@ -714,17 +726,23 @@ def extract_between_tags(tag: str, string: str, strip: bool = False) -> List[str
ext_list = [e.strip() for e in ext_list]
return ext_list
def contains_tag(tag: str, string: str) -> bool:
return bool(re.search(f"<{tag}>(.+?)</{tag}>", string, re.DOTALL))
def parse_xml_params(xml_content):
root = ET.fromstring(xml_content)
params = {}
for child in root.findall(".//parameters/*"):
params[child.tag] = child.text
try:
# Attempt to decode the element's text as JSON
params[child.tag] = json.loads(child.text)
except json.JSONDecodeError:
# If JSON decoding fails, use the original text
params[child.tag] = child.text
return params
###
@ -917,7 +935,7 @@ def gemini_text_image_pt(messages: list):
}
"""
try:
import google.generativeai as genai
import google.generativeai as genai # type: ignore
except:
raise Exception(
"Importing google.generativeai failed, please run 'pip install -q google-generativeai"
@ -958,9 +976,7 @@ def azure_text_pt(messages: list):
# Function call template
def function_call_prompt(messages: list, functions: list):
function_prompt = (
"""Produce JSON OUTPUT ONLY! Adhere to this format {"name": "function_name", "arguments":{"argument_name": "argument_value"}} The following functions are available to you:"""
)
function_prompt = """Produce JSON OUTPUT ONLY! Adhere to this format {"name": "function_name", "arguments":{"argument_name": "argument_value"}} The following functions are available to you:"""
for function in functions:
function_prompt += f"""\n{function}\n"""

View file

@ -166,6 +166,7 @@ def completion(
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
model_id = optional_params.pop("model_id", None)
if aws_access_key_id != None:
# uses auth params passed to completion
@ -245,15 +246,28 @@ def completion(
model=model,
logging_obj=logging_obj,
data=data,
model_id=model_id,
aws_secret_access_key=aws_secret_access_key,
aws_access_key_id=aws_access_key_id,
aws_region_name=aws_region_name,
)
return response
response = client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
if model_id is not None:
response = client.invoke_endpoint_with_response_stream(
EndpointName=model,
InferenceComponentName=model_id,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
else:
response = client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
return response["Body"]
elif acompletion == True:
_data = {"inputs": prompt, "parameters": inference_params}
@ -264,36 +278,68 @@ def completion(
model=model,
logging_obj=logging_obj,
data=_data,
model_id=model_id,
aws_secret_access_key=aws_secret_access_key,
aws_access_key_id=aws_access_key_id,
aws_region_name=aws_region_name,
)
data = json.dumps({"inputs": prompt, "parameters": inference_params}).encode(
"utf-8"
)
## LOGGING
request_str = f"""
response = client.invoke_endpoint(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={
"complete_input_dict": data,
"request_str": request_str,
"hf_model_name": hf_model_name,
},
)
## COMPLETION CALL
try:
response = client.invoke_endpoint(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
if model_id is not None:
## LOGGING
request_str = f"""
response = client.invoke_endpoint(
EndpointName={model},
InferenceComponentName={model_id},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={
"complete_input_dict": data,
"request_str": request_str,
"hf_model_name": hf_model_name,
},
)
response = client.invoke_endpoint(
EndpointName=model,
InferenceComponentName=model_id,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
else:
## LOGGING
request_str = f"""
response = client.invoke_endpoint(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
logging_obj.pre_call(
input=prompt,
api_key="",
additional_args={
"complete_input_dict": data,
"request_str": request_str,
"hf_model_name": hf_model_name,
},
)
response = client.invoke_endpoint(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
except Exception as e:
status_code = (
getattr(e, "response", {})
@ -303,6 +349,8 @@ def completion(
error_message = (
getattr(e, "response", {}).get("Error", {}).get("Message", str(e))
)
if "Inference Component Name header is required" in error_message:
error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
raise SagemakerError(status_code=status_code, message=error_message)
response = response["Body"].read().decode("utf8")
@ -357,8 +405,12 @@ async def async_streaming(
encoding,
model_response: ModelResponse,
model: str,
model_id: Optional[str],
logging_obj: Any,
data,
aws_secret_access_key: Optional[str],
aws_access_key_id: Optional[str],
aws_region_name: Optional[str],
):
"""
Use aioboto3
@ -367,11 +419,6 @@ async def async_streaming(
session = aioboto3.Session()
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
if aws_access_key_id != None:
# uses auth params passed to completion
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
@ -398,12 +445,21 @@ async def async_streaming(
async with _client as client:
try:
response = await client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
if model_id is not None:
response = await client.invoke_endpoint_with_response_stream(
EndpointName=model,
InferenceComponentName=model_id,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
else:
response = await client.invoke_endpoint_with_response_stream(
EndpointName=model,
ContentType="application/json",
Body=data,
CustomAttributes="accept_eula=true",
)
except Exception as e:
raise SagemakerError(status_code=500, message=f"{str(e)}")
response = response["Body"]
@ -418,6 +474,10 @@ async def async_completion(
model: str,
logging_obj: Any,
data: dict,
model_id: Optional[str],
aws_secret_access_key: Optional[str],
aws_access_key_id: Optional[str],
aws_region_name: Optional[str],
):
"""
Use aioboto3
@ -426,11 +486,6 @@ async def async_completion(
session = aioboto3.Session()
# pop aws_secret_access_key, aws_access_key_id, aws_region_name from kwargs, since completion calls fail with them
aws_secret_access_key = optional_params.pop("aws_secret_access_key", None)
aws_access_key_id = optional_params.pop("aws_access_key_id", None)
aws_region_name = optional_params.pop("aws_region_name", None)
if aws_access_key_id != None:
# uses auth params passed to completion
# aws_access_key_id is not None, assume user is trying to auth using litellm.completion
@ -456,33 +511,63 @@ async def async_completion(
)
async with _client as client:
## LOGGING
request_str = f"""
response = client.invoke_endpoint(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
logging_obj.pre_call(
input=data["inputs"],
api_key="",
additional_args={
"complete_input_dict": data,
"request_str": request_str,
},
)
encoded_data = json.dumps(data).encode("utf-8")
try:
response = await client.invoke_endpoint(
EndpointName=model,
ContentType="application/json",
Body=encoded_data,
CustomAttributes="accept_eula=true",
)
if model_id is not None:
## LOGGING
request_str = f"""
response = client.invoke_endpoint(
EndpointName={model},
InferenceComponentName={model_id},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
logging_obj.pre_call(
input=data["inputs"],
api_key="",
additional_args={
"complete_input_dict": data,
"request_str": request_str,
},
)
response = await client.invoke_endpoint(
EndpointName=model,
InferenceComponentName=model_id,
ContentType="application/json",
Body=encoded_data,
CustomAttributes="accept_eula=true",
)
else:
## LOGGING
request_str = f"""
response = client.invoke_endpoint(
EndpointName={model},
ContentType="application/json",
Body={data},
CustomAttributes="accept_eula=true",
)
""" # type: ignore
logging_obj.pre_call(
input=data["inputs"],
api_key="",
additional_args={
"complete_input_dict": data,
"request_str": request_str,
},
)
response = await client.invoke_endpoint(
EndpointName=model,
ContentType="application/json",
Body=encoded_data,
CustomAttributes="accept_eula=true",
)
except Exception as e:
raise SagemakerError(status_code=500, message=f"{str(e)}")
error_message = f"{str(e)}"
if "Inference Component Name header is required" in error_message:
error_message += "\n pass in via `litellm.completion(..., model_id={InferenceComponentName})`"
raise SagemakerError(status_code=500, message=error_message)
response = await response["Body"].read()
response = response.decode("utf8")
## LOGGING

View file

@ -289,11 +289,11 @@ def completion(
Part,
GenerationConfig,
)
from google.cloud import aiplatform
from google.cloud import aiplatform # type: ignore
from google.protobuf import json_format # type: ignore
from google.protobuf.struct_pb2 import Value # type: ignore
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types
import google.auth
from google.cloud.aiplatform_v1beta1.types import content as gapic_content_types # type: ignore
import google.auth # type: ignore
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
print_verbose(
@ -783,7 +783,7 @@ async def async_completion(
"""
Vertex AI Model Garden
"""
from google.cloud import aiplatform
from google.cloud import aiplatform # type: ignore
## LOGGING
logging_obj.pre_call(
@ -969,7 +969,7 @@ async def async_streaming(
)
response = llm_model.predict_streaming_async(prompt, **optional_params)
elif mode == "custom":
from google.cloud import aiplatform
from google.cloud import aiplatform # type: ignore
stream = optional_params.pop("stream", None)
@ -1059,7 +1059,7 @@ def embedding(
)
from vertexai.language_models import TextEmbeddingModel
import google.auth
import google.auth # type: ignore
## Load credentials with the correct quota project ref: https://github.com/googleapis/python-aiplatform/issues/2557#issuecomment-1709284744
try:

View file

@ -115,24 +115,54 @@ class LiteLLM:
default_headers: Optional[Mapping[str, str]] = None,
):
self.params = locals()
self.chat = Chat(self.params)
self.chat = Chat(self.params, router_obj=None)
class Chat:
def __init__(self, params):
def __init__(self, params, router_obj: Optional[Any]):
self.params = params
self.completions = Completions(self.params)
if self.params.get("acompletion", False) == True:
self.params.pop("acompletion")
self.completions: Union[AsyncCompletions, Completions] = AsyncCompletions(
self.params, router_obj=router_obj
)
else:
self.completions = Completions(self.params, router_obj=router_obj)
class Completions:
def __init__(self, params):
def __init__(self, params, router_obj: Optional[Any]):
self.params = params
self.router_obj = router_obj
def create(self, messages, model=None, **kwargs):
for k, v in kwargs.items():
self.params[k] = v
model = model or self.params.get("model")
response = completion(model=model, messages=messages, **self.params)
if self.router_obj is not None:
response = self.router_obj.completion(
model=model, messages=messages, **self.params
)
else:
response = completion(model=model, messages=messages, **self.params)
return response
class AsyncCompletions:
def __init__(self, params, router_obj: Optional[Any]):
self.params = params
self.router_obj = router_obj
async def create(self, messages, model=None, **kwargs):
for k, v in kwargs.items():
self.params[k] = v
model = model or self.params.get("model")
if self.router_obj is not None:
response = await self.router_obj.acompletion(
model=model, messages=messages, **self.params
)
else:
response = await acompletion(model=model, messages=messages, **self.params)
return response
@ -571,6 +601,7 @@ def completion(
"ttl",
"cache",
"no-log",
"base_model",
]
default_params = openai_params + litellm_params
non_default_params = {
@ -639,7 +670,7 @@ def completion(
elif (
input_cost_per_second is not None
): # time based pricing just needs cost in place
output_cost_per_second = output_cost_per_second or 0.0
output_cost_per_second = output_cost_per_second
litellm.register_model(
{
f"{custom_llm_provider}/{model}": {
@ -1752,7 +1783,11 @@ def completion(
timeout=timeout,
)
if "stream" in optional_params and optional_params["stream"] == True:
if (
"stream" in optional_params
and optional_params["stream"] == True
and not isinstance(response, CustomStreamWrapper)
):
# don't try to access stream object,
if "ai21" in model:
response = CustomStreamWrapper(
@ -2754,28 +2789,25 @@ def embedding(
model_response=EmbeddingResponse(),
)
elif custom_llm_provider == "ollama":
ollama_input = None
if isinstance(input, list) and len(input) > 1:
raise litellm.BadRequestError(
message=f"Ollama Embeddings don't support batch embeddings",
model=model, # type: ignore
llm_provider="ollama", # type: ignore
)
if isinstance(input, list) and len(input) == 1:
ollama_input = "".join(input[0])
elif isinstance(input, str):
ollama_input = input
else:
api_base = (
litellm.api_base
or api_base
or get_secret("OLLAMA_API_BASE")
or "http://localhost:11434"
)
if isinstance(input, str):
input = [input]
if not all(isinstance(item, str) for item in input):
raise litellm.BadRequestError(
message=f"Invalid input for ollama embeddings. input={input}",
model=model, # type: ignore
llm_provider="ollama", # type: ignore
)
if aembedding == True:
if aembedding:
response = ollama.ollama_aembeddings(
api_base=api_base,
model=model,
prompt=ollama_input,
prompts=input,
encoding=encoding,
logging_obj=logging,
optional_params=optional_params,

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/68a21c6e6697f7ca.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();
!function(){"use strict";var e,t,n,r,o,u,i,c,f,a={},l={};function d(e){var t=l[e];if(void 0!==t)return t.exports;var n=l[e]={id:e,loaded:!1,exports:{}},r=!0;try{a[e](n,n.exports,d),r=!1}finally{r&&delete l[e]}return n.loaded=!0,n.exports}d.m=a,e=[],d.O=function(t,n,r,o){if(n){o=o||0;for(var u=e.length;u>0&&e[u-1][2]>o;u--)e[u]=e[u-1];e[u]=[n,r,o];return}for(var i=1/0,u=0;u<e.length;u++){for(var n=e[u][0],r=e[u][1],o=e[u][2],c=!0,f=0;f<n.length;f++)i>=o&&Object.keys(d.O).every(function(e){return d.O[e](n[f])})?n.splice(f--,1):(c=!1,o<i&&(i=o));if(c){e.splice(u--,1);var a=r();void 0!==a&&(t=a)}}return t},d.n=function(e){var t=e&&e.__esModule?function(){return e.default}:function(){return e};return d.d(t,{a:t}),t},n=Object.getPrototypeOf?function(e){return Object.getPrototypeOf(e)}:function(e){return e.__proto__},d.t=function(e,r){if(1&r&&(e=this(e)),8&r||"object"==typeof e&&e&&(4&r&&e.__esModule||16&r&&"function"==typeof e.then))return e;var o=Object.create(null);d.r(o);var u={};t=t||[null,n({}),n([]),n(n)];for(var i=2&r&&e;"object"==typeof i&&!~t.indexOf(i);i=n(i))Object.getOwnPropertyNames(i).forEach(function(t){u[t]=function(){return e[t]}});return u.default=function(){return e},d.d(o,u),o},d.d=function(e,t){for(var n in t)d.o(t,n)&&!d.o(e,n)&&Object.defineProperty(e,n,{enumerable:!0,get:t[n]})},d.f={},d.e=function(e){return Promise.all(Object.keys(d.f).reduce(function(t,n){return d.f[n](e,t),t},[]))},d.u=function(e){},d.miniCssF=function(e){return"static/css/f8da5a6a5b29d249.css"},d.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||Function("return this")()}catch(e){if("object"==typeof window)return window}}(),d.o=function(e,t){return Object.prototype.hasOwnProperty.call(e,t)},r={},o="_N_E:",d.l=function(e,t,n,u){if(r[e]){r[e].push(t);return}if(void 0!==n)for(var i,c,f=document.getElementsByTagName("script"),a=0;a<f.length;a++){var l=f[a];if(l.getAttribute("src")==e||l.getAttribute("data-webpack")==o+n){i=l;break}}i||(c=!0,(i=document.createElement("script")).charset="utf-8",i.timeout=120,d.nc&&i.setAttribute("nonce",d.nc),i.setAttribute("data-webpack",o+n),i.src=d.tu(e)),r[e]=[t];var s=function(t,n){i.onerror=i.onload=null,clearTimeout(p);var o=r[e];if(delete r[e],i.parentNode&&i.parentNode.removeChild(i),o&&o.forEach(function(e){return e(n)}),t)return t(n)},p=setTimeout(s.bind(null,void 0,{type:"timeout",target:i}),12e4);i.onerror=s.bind(null,i.onerror),i.onload=s.bind(null,i.onload),c&&document.head.appendChild(i)},d.r=function(e){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},d.nmd=function(e){return e.paths=[],e.children||(e.children=[]),e},d.tt=function(){return void 0===u&&(u={createScriptURL:function(e){return e}},"undefined"!=typeof trustedTypes&&trustedTypes.createPolicy&&(u=trustedTypes.createPolicy("nextjs#bundler",u))),u},d.tu=function(e){return d.tt().createScriptURL(e)},d.p="/ui/_next/",i={272:0},d.f.j=function(e,t){var n=d.o(i,e)?i[e]:void 0;if(0!==n){if(n)t.push(n[2]);else if(272!=e){var r=new Promise(function(t,r){n=i[e]=[t,r]});t.push(n[2]=r);var o=d.p+d.u(e),u=Error();d.l(o,function(t){if(d.o(i,e)&&(0!==(n=i[e])&&(i[e]=void 0),n)){var r=t&&("load"===t.type?"missing":t.type),o=t&&t.target&&t.target.src;u.message="Loading chunk "+e+" failed.\n("+r+": "+o+")",u.name="ChunkLoadError",u.type=r,u.request=o,n[1](u)}},"chunk-"+e,e)}else i[e]=0}},d.O.j=function(e){return 0===i[e]},c=function(e,t){var n,r,o=t[0],u=t[1],c=t[2],f=0;if(o.some(function(e){return 0!==i[e]})){for(n in u)d.o(u,n)&&(d.m[n]=u[n]);if(c)var a=c(d)}for(e&&e(t);f<o.length;f++)r=o[f],d.o(i,r)&&i[r]&&i[r][0](),i[r]=0;return d.O(a)},(f=self.webpackChunk_N_E=self.webpackChunk_N_E||[]).forEach(c.bind(null,0)),f.push=c.bind(null,f.push.bind(f))}();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -1 +1 @@
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-3b0d290a8fe6941d.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a85b2c176012d8e5.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-e1b183dda365ec86.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>🚅 LiteLLM</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-3b0d290a8fe6941d.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/68a21c6e6697f7ca.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[19914,[\"730\",\"static/chunks/730-1411b729a1c79695.js\",\"931\",\"static/chunks/app/page-df9015da04018cc1.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/68a21c6e6697f7ca.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"tXZFkeqtgh-goIRVbw_9q\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"🚅 LiteLLM\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>
<!DOCTYPE html><html id="__next_error__"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="preload" as="script" fetchPriority="low" href="/ui/_next/static/chunks/webpack-d1ad37b1875df240.js" crossorigin=""/><script src="/ui/_next/static/chunks/fd9d1056-a507ee9e75a3be72.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/69-589b47e7a69d316f.js" async="" crossorigin=""></script><script src="/ui/_next/static/chunks/main-app-9b4fb13a7db53edf.js" async="" crossorigin=""></script><title>🚅 LiteLLM</title><meta name="description" content="LiteLLM Proxy Admin UI"/><link rel="icon" href="/ui/favicon.ico" type="image/x-icon" sizes="16x16"/><meta name="next-size-adjust"/><script src="/ui/_next/static/chunks/polyfills-c67a75d1b6f99dc8.js" crossorigin="" noModule=""></script></head><body><script src="/ui/_next/static/chunks/webpack-d1ad37b1875df240.js" crossorigin="" async=""></script><script>(self.__next_f=self.__next_f||[]).push([0]);self.__next_f.push([2,null])</script><script>self.__next_f.push([1,"1:HL[\"/ui/_next/static/media/c9a5bc6a7c948fb0-s.p.woff2\",\"font\",{\"crossOrigin\":\"\",\"type\":\"font/woff2\"}]\n2:HL[\"/ui/_next/static/css/f8da5a6a5b29d249.css\",\"style\",{\"crossOrigin\":\"\"}]\n0:\"$L3\"\n"])</script><script>self.__next_f.push([1,"4:I[47690,[],\"\"]\n6:I[77831,[],\"\"]\n7:I[90177,[\"798\",\"static/chunks/798-4baed68da0c5497d.js\",\"931\",\"static/chunks/app/page-a5a04da2a9356785.js\"],\"\"]\n8:I[5613,[],\"\"]\n9:I[31778,[],\"\"]\nb:I[48955,[],\"\"]\nc:[]\n"])</script><script>self.__next_f.push([1,"3:[[[\"$\",\"link\",\"0\",{\"rel\":\"stylesheet\",\"href\":\"/ui/_next/static/css/f8da5a6a5b29d249.css\",\"precedence\":\"next\",\"crossOrigin\":\"\"}]],[\"$\",\"$L4\",null,{\"buildId\":\"DptMjzo5xd96cx0b56k4u\",\"assetPrefix\":\"/ui\",\"initialCanonicalUrl\":\"/\",\"initialTree\":[\"\",{\"children\":[\"__PAGE__\",{}]},\"$undefined\",\"$undefined\",true],\"initialSeedData\":[\"\",{\"children\":[\"__PAGE__\",{},[\"$L5\",[\"$\",\"$L6\",null,{\"propsForComponent\":{\"params\":{}},\"Component\":\"$7\",\"isStaticGeneration\":true}],null]]},[null,[\"$\",\"html\",null,{\"lang\":\"en\",\"children\":[\"$\",\"body\",null,{\"className\":\"__className_c23dc8\",\"children\":[\"$\",\"$L8\",null,{\"parallelRouterKey\":\"children\",\"segmentPath\":[\"children\"],\"loading\":\"$undefined\",\"loadingStyles\":\"$undefined\",\"loadingScripts\":\"$undefined\",\"hasLoading\":false,\"error\":\"$undefined\",\"errorStyles\":\"$undefined\",\"errorScripts\":\"$undefined\",\"template\":[\"$\",\"$L9\",null,{}],\"templateStyles\":\"$undefined\",\"templateScripts\":\"$undefined\",\"notFound\":[[\"$\",\"title\",null,{\"children\":\"404: This page could not be found.\"}],[\"$\",\"div\",null,{\"style\":{\"fontFamily\":\"system-ui,\\\"Segoe UI\\\",Roboto,Helvetica,Arial,sans-serif,\\\"Apple Color Emoji\\\",\\\"Segoe UI Emoji\\\"\",\"height\":\"100vh\",\"textAlign\":\"center\",\"display\":\"flex\",\"flexDirection\":\"column\",\"alignItems\":\"center\",\"justifyContent\":\"center\"},\"children\":[\"$\",\"div\",null,{\"children\":[[\"$\",\"style\",null,{\"dangerouslySetInnerHTML\":{\"__html\":\"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}\"}}],[\"$\",\"h1\",null,{\"className\":\"next-error-h1\",\"style\":{\"display\":\"inline-block\",\"margin\":\"0 20px 0 0\",\"padding\":\"0 23px 0 0\",\"fontSize\":24,\"fontWeight\":500,\"verticalAlign\":\"top\",\"lineHeight\":\"49px\"},\"children\":\"404\"}],[\"$\",\"div\",null,{\"style\":{\"display\":\"inline-block\"},\"children\":[\"$\",\"h2\",null,{\"style\":{\"fontSize\":14,\"fontWeight\":400,\"lineHeight\":\"49px\",\"margin\":0},\"children\":\"This page could not be found.\"}]}]]}]}]],\"notFoundStyles\":[],\"styles\":null}]}]}],null]],\"initialHead\":[false,\"$La\"],\"globalErrorComponent\":\"$b\",\"missingSlots\":\"$Wc\"}]]\n"])</script><script>self.__next_f.push([1,"a:[[\"$\",\"meta\",\"0\",{\"name\":\"viewport\",\"content\":\"width=device-width, initial-scale=1\"}],[\"$\",\"meta\",\"1\",{\"charSet\":\"utf-8\"}],[\"$\",\"title\",\"2\",{\"children\":\"🚅 LiteLLM\"}],[\"$\",\"meta\",\"3\",{\"name\":\"description\",\"content\":\"LiteLLM Proxy Admin UI\"}],[\"$\",\"link\",\"4\",{\"rel\":\"icon\",\"href\":\"/ui/favicon.ico\",\"type\":\"image/x-icon\",\"sizes\":\"16x16\"}],[\"$\",\"meta\",\"5\",{\"name\":\"next-size-adjust\"}]]\n5:null\n"])</script><script>self.__next_f.push([1,""])</script></body></html>

View file

@ -1,7 +1,7 @@
2:I[77831,[],""]
3:I[19914,["730","static/chunks/730-1411b729a1c79695.js","931","static/chunks/app/page-df9015da04018cc1.js"],""]
3:I[90177,["798","static/chunks/798-4baed68da0c5497d.js","931","static/chunks/app/page-a5a04da2a9356785.js"],""]
4:I[5613,[],""]
5:I[31778,[],""]
0:["tXZFkeqtgh-goIRVbw_9q",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/68a21c6e6697f7ca.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
0:["DptMjzo5xd96cx0b56k4u",[[["",{"children":["__PAGE__",{}]},"$undefined","$undefined",true],["",{"children":["__PAGE__",{},["$L1",["$","$L2",null,{"propsForComponent":{"params":{}},"Component":"$3","isStaticGeneration":true}],null]]},[null,["$","html",null,{"lang":"en","children":["$","body",null,{"className":"__className_c23dc8","children":["$","$L4",null,{"parallelRouterKey":"children","segmentPath":["children"],"loading":"$undefined","loadingStyles":"$undefined","loadingScripts":"$undefined","hasLoading":false,"error":"$undefined","errorStyles":"$undefined","errorScripts":"$undefined","template":["$","$L5",null,{}],"templateStyles":"$undefined","templateScripts":"$undefined","notFound":[["$","title",null,{"children":"404: This page could not be found."}],["$","div",null,{"style":{"fontFamily":"system-ui,\"Segoe UI\",Roboto,Helvetica,Arial,sans-serif,\"Apple Color Emoji\",\"Segoe UI Emoji\"","height":"100vh","textAlign":"center","display":"flex","flexDirection":"column","alignItems":"center","justifyContent":"center"},"children":["$","div",null,{"children":[["$","style",null,{"dangerouslySetInnerHTML":{"__html":"body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}"}}],["$","h1",null,{"className":"next-error-h1","style":{"display":"inline-block","margin":"0 20px 0 0","padding":"0 23px 0 0","fontSize":24,"fontWeight":500,"verticalAlign":"top","lineHeight":"49px"},"children":"404"}],["$","div",null,{"style":{"display":"inline-block"},"children":["$","h2",null,{"style":{"fontSize":14,"fontWeight":400,"lineHeight":"49px","margin":0},"children":"This page could not be found."}]}]]}]}]],"notFoundStyles":[],"styles":null}]}]}],null]],[[["$","link","0",{"rel":"stylesheet","href":"/ui/_next/static/css/f8da5a6a5b29d249.css","precedence":"next","crossOrigin":""}]],"$L6"]]]]
6:[["$","meta","0",{"name":"viewport","content":"width=device-width, initial-scale=1"}],["$","meta","1",{"charSet":"utf-8"}],["$","title","2",{"children":"🚅 LiteLLM"}],["$","meta","3",{"name":"description","content":"LiteLLM Proxy Admin UI"}],["$","link","4",{"rel":"icon","href":"/ui/favicon.ico","type":"image/x-icon","sizes":"16x16"}],["$","meta","5",{"name":"next-size-adjust"}]]
1:null

View file

@ -1,21 +1,20 @@
model_list:
- model_name: fake_openai
- model_name: fake-openai-endpoint
litellm_params:
model: openai/my-fake-model
api_key: my-fake-key
api_base: http://0.0.0.0:8080
- model_name: gpt-3.5-turbo
litellm_params:
model: gpt-3.5-turbo-1106
api_key: os.environ/OPENAI_API_KEY
api_base: https://exampleopenaiendpoint-production.up.railway.app/
litellm_settings:
cache: true
cache_params:
type: redis
callbacks: ["batch_redis_requests"]
# success_callbacks: ["langfuse"]
max_budget: 600020
budget_duration: 30d
general_settings:
master_key: sk-1234
database_url: "postgresql://neondb_owner:hz8tyUlJ5ivV@ep-cool-sunset-a5ywubeh.us-east-2.aws.neon.tech/neondb?sslmode=require"
proxy_batch_write_at: 60 # 👈 Frequency of batch writing logs to server (in seconds)
enable_jwt_auth: True
alerting: ["slack"]
litellm_jwtauth:
admin_jwt_scope: "litellm_proxy_admin"
team_jwt_scope: "litellm_team"
public_key_ttl: 600

View file

@ -1,4 +1,5 @@
from pydantic import BaseModel, Extra, Field, root_validator, Json
from pydantic import BaseModel, Extra, Field, root_validator, Json, validator
from dataclasses import fields
import enum
from typing import Optional, List, Union, Dict, Literal, Any
from datetime import datetime
@ -14,11 +15,6 @@ def hash_token(token: str):
return hashed_token
class LiteLLMProxyRoles(enum.Enum):
PROXY_ADMIN = "litellm_proxy_admin"
USER = "litellm_user"
class LiteLLMBase(BaseModel):
"""
Implements default functions, all pydantic objects should have.
@ -42,6 +38,135 @@ class LiteLLMBase(BaseModel):
protected_namespaces = ()
class LiteLLMRoutes(enum.Enum):
openai_routes: List = [ # chat completions
"/openai/deployments/{model}/chat/completions",
"/chat/completions",
"/v1/chat/completions",
# completions
"/openai/deployments/{model}/completions",
"/completions",
"/v1/completions",
# embeddings
"/openai/deployments/{model}/embeddings",
"/embeddings",
"/v1/embeddings",
# image generation
"/images/generations",
"/v1/images/generations",
# audio transcription
"/audio/transcriptions",
"/v1/audio/transcriptions",
# moderations
"/moderations",
"/v1/moderations",
# models
"/models",
"/v1/models",
]
info_routes: List = ["/key/info", "/team/info", "/user/info", "/model/info"]
management_routes: List = [ # key
"/key/generate",
"/key/update",
"/key/delete",
"/key/info",
# user
"/user/new",
"/user/update",
"/user/delete",
"/user/info",
# team
"/team/new",
"/team/update",
"/team/delete",
"/team/info",
"/team/block",
"/team/unblock",
# model
"/model/new",
"/model/update",
"/model/delete",
"/model/info",
]
class LiteLLM_JWTAuth(LiteLLMBase):
"""
A class to define the roles and permissions for a LiteLLM Proxy w/ JWT Auth.
Attributes:
- admin_jwt_scope: The JWT scope required for proxy admin roles.
- admin_allowed_routes: list of allowed routes for proxy admin roles.
- team_jwt_scope: The JWT scope required for proxy team roles.
- team_id_jwt_field: The field in the JWT token that stores the team ID. Default - `client_id`.
- team_allowed_routes: list of allowed routes for proxy team roles.
- end_user_id_jwt_field: Default - `sub`. The field in the JWT token that stores the end-user ID. Turn this off by setting to `None`. Enables end-user cost tracking.
- public_key_ttl: Default - 600s. TTL for caching public JWT keys.
See `auth_checks.py` for the specific routes
"""
admin_jwt_scope: str = "litellm_proxy_admin"
admin_allowed_routes: List[
Literal["openai_routes", "info_routes", "management_routes"]
] = ["management_routes"]
team_jwt_scope: str = "litellm_team"
team_id_jwt_field: str = "client_id"
team_allowed_routes: List[
Literal["openai_routes", "info_routes", "management_routes"]
] = ["openai_routes", "info_routes"]
end_user_id_jwt_field: Optional[str] = "sub"
public_key_ttl: float = 600
def __init__(self, **kwargs: Any) -> None:
# get the attribute names for this Pydantic model
allowed_keys = self.__annotations__.keys()
invalid_keys = set(kwargs.keys()) - allowed_keys
if invalid_keys:
raise ValueError(
f"Invalid arguments provided: {', '.join(invalid_keys)}. Allowed arguments are: {', '.join(allowed_keys)}."
)
super().__init__(**kwargs)
class LiteLLMPromptInjectionParams(LiteLLMBase):
heuristics_check: bool = False
vector_db_check: bool = False
llm_api_check: bool = False
llm_api_name: Optional[str] = None
llm_api_system_prompt: Optional[str] = None
llm_api_fail_call_string: Optional[str] = None
@root_validator(pre=True)
def check_llm_api_params(cls, values):
llm_api_check = values.get("llm_api_check")
if llm_api_check is True:
if "llm_api_name" not in values or not values["llm_api_name"]:
raise ValueError(
"If llm_api_check is set to True, llm_api_name must be provided"
)
if (
"llm_api_system_prompt" not in values
or not values["llm_api_system_prompt"]
):
raise ValueError(
"If llm_api_check is set to True, llm_api_system_prompt must be provided"
)
if (
"llm_api_fail_call_string" not in values
or not values["llm_api_fail_call_string"]
):
raise ValueError(
"If llm_api_check is set to True, llm_api_fail_call_string must be provided"
)
return values
######### Request Class Definition ######
class ProxyChatCompletionRequest(LiteLLMBase):
model: str
@ -180,7 +305,7 @@ class GenerateKeyResponse(GenerateKeyRequest):
key: str
key_name: Optional[str] = None
expires: Optional[datetime]
user_id: str
user_id: Optional[str] = None
@root_validator(pre=True)
def set_model_info(cls, values):
@ -274,6 +399,7 @@ class TeamBase(LiteLLMBase):
rpm_limit: Optional[int] = None
max_budget: Optional[float] = None
models: list = []
blocked: bool = False
class NewTeamRequest(TeamBase):
@ -301,19 +427,18 @@ class TeamMemberDeleteRequest(LiteLLMBase):
return values
class UpdateTeamRequest(LiteLLMBase):
class UpdateTeamRequest(TeamBase):
team_id: str # required
team_alias: Optional[str] = None
admins: Optional[list] = None
members: Optional[list] = None
members_with_roles: Optional[List[Member]] = None
metadata: Optional[dict] = None
class DeleteTeamRequest(LiteLLMBase):
team_ids: List[str] # required
class BlockTeamRequest(LiteLLMBase):
team_id: str # required
class LiteLLM_TeamTable(TeamBase):
spend: Optional[float] = None
max_parallel_requests: Optional[int] = None
@ -498,6 +623,9 @@ class ConfigGeneralSettings(LiteLLMBase):
ui_access_mode: Optional[Literal["admin_only", "all"]] = Field(
"all", description="Control access to the Proxy UI"
)
allowed_routes: Optional[List] = Field(
None, description="Proxy API Endpoints you want users to be able to access"
)
class ConfigYAML(LiteLLMBase):
@ -565,6 +693,8 @@ class LiteLLM_VerificationTokenView(LiteLLM_VerificationToken):
team_tpm_limit: Optional[int] = None
team_rpm_limit: Optional[int] = None
team_max_budget: Optional[float] = None
team_models: List = []
team_blocked: bool = False
soft_budget: Optional[float] = None
team_model_aliases: Optional[Dict] = None

View file

@ -8,45 +8,160 @@ Run checks for:
2. If user is in budget
3. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
"""
from litellm.proxy._types import LiteLLM_UserTable, LiteLLM_EndUserTable
from typing import Optional
from litellm.proxy._types import (
LiteLLM_UserTable,
LiteLLM_EndUserTable,
LiteLLM_JWTAuth,
LiteLLM_TeamTable,
LiteLLMRoutes,
)
from typing import Optional, Literal, Union
from litellm.proxy.utils import PrismaClient
from litellm.caching import DualCache
import litellm
all_routes = LiteLLMRoutes.openai_routes.value + LiteLLMRoutes.management_routes.value
def common_checks(
request_body: dict,
user_object: LiteLLM_UserTable,
team_object: LiteLLM_TeamTable,
end_user_object: Optional[LiteLLM_EndUserTable],
global_proxy_spend: Optional[float],
general_settings: dict,
route: str,
) -> bool:
"""
Common checks across jwt + key-based auth.
1. If team is blocked
2. If team can call model
3. If team is in budget
4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
5. [OPTIONAL] If 'enforce_end_user' enabled - did developer pass in 'user' param for openai endpoints
6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
"""
_model = request_body.get("model", None)
# 1. If user can call model
if team_object.blocked == True:
raise Exception(
f"Team={team_object.team_id} is blocked. Update via `/team/unblock` if your admin."
)
# 2. If user can call model
if (
_model is not None
and len(user_object.models) > 0
and _model not in user_object.models
and len(team_object.models) > 0
and _model not in team_object.models
):
raise Exception(
f"User={user_object.user_id} not allowed to call model={_model}. Allowed user models = {user_object.models}"
f"Team={team_object.team_id} not allowed to call model={_model}. Allowed team models = {team_object.models}"
)
# 2. If user is in budget
# 3. If team is in budget
if (
user_object.max_budget is not None
and user_object.spend > user_object.max_budget
team_object.max_budget is not None
and team_object.spend is not None
and team_object.spend > team_object.max_budget
):
raise Exception(
f"User={user_object.user_id} over budget. Spend={user_object.spend}, Budget={user_object.max_budget}"
f"Team={team_object.team_id} over budget. Spend={team_object.spend}, Budget={team_object.max_budget}"
)
# 3. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
# 4. If end_user ('user' passed to /chat/completions, /embeddings endpoint) is in budget
if end_user_object is not None and end_user_object.litellm_budget_table is not None:
end_user_budget = end_user_object.litellm_budget_table.max_budget
if end_user_budget is not None and end_user_object.spend > end_user_budget:
raise Exception(
f"End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
f"ExceededBudget: End User={end_user_object.user_id} over budget. Spend={end_user_object.spend}, Budget={end_user_budget}"
)
# 5. [OPTIONAL] If 'enforce_user_param' enabled - did developer pass in 'user' param for openai endpoints
if (
general_settings.get("enforce_user_param", None) is not None
and general_settings["enforce_user_param"] == True
):
if route in LiteLLMRoutes.openai_routes.value and "user" not in request_body:
raise Exception(
f"'user' param not passed in. 'enforce_user_param'={general_settings['enforce_user_param']}"
)
# 6. [OPTIONAL] If 'litellm.max_budget' is set (>0), is proxy under budget
if litellm.max_budget > 0 and global_proxy_spend is not None:
if global_proxy_spend > litellm.max_budget:
raise Exception(
f"ExceededBudget: LiteLLM Proxy has exceeded its budget. Current spend: {global_proxy_spend}; Max Budget: {litellm.max_budget}"
)
return True
def _allowed_routes_check(user_route: str, allowed_routes: list) -> bool:
for allowed_route in allowed_routes:
if (
allowed_route == LiteLLMRoutes.openai_routes.name
and user_route in LiteLLMRoutes.openai_routes.value
):
return True
elif (
allowed_route == LiteLLMRoutes.info_routes.name
and user_route in LiteLLMRoutes.info_routes.value
):
return True
elif (
allowed_route == LiteLLMRoutes.management_routes.name
and user_route in LiteLLMRoutes.management_routes.value
):
return True
elif allowed_route == user_route:
return True
return False
def allowed_routes_check(
user_role: Literal["proxy_admin", "team"],
user_route: str,
litellm_proxy_roles: LiteLLM_JWTAuth,
) -> bool:
"""
Check if user -> not admin - allowed to access these routes
"""
if user_role == "proxy_admin":
if litellm_proxy_roles.admin_allowed_routes is None:
is_allowed = _allowed_routes_check(
user_route=user_route, allowed_routes=["management_routes"]
)
return is_allowed
elif litellm_proxy_roles.admin_allowed_routes is not None:
is_allowed = _allowed_routes_check(
user_route=user_route,
allowed_routes=litellm_proxy_roles.admin_allowed_routes,
)
return is_allowed
elif user_role == "team":
if litellm_proxy_roles.team_allowed_routes is None:
"""
By default allow a team to call openai + info routes
"""
is_allowed = _allowed_routes_check(
user_route=user_route, allowed_routes=["openai_routes", "info_routes"]
)
return is_allowed
elif litellm_proxy_roles.team_allowed_routes is not None:
is_allowed = _allowed_routes_check(
user_route=user_route,
allowed_routes=litellm_proxy_roles.team_allowed_routes,
)
return is_allowed
return False
def get_actual_routes(allowed_routes: list) -> list:
actual_routes: list = []
for route_name in allowed_routes:
try:
route_value = LiteLLMRoutes[route_name].value
actual_routes = actual_routes + route_value
except KeyError:
actual_routes.append(route_name)
return actual_routes
async def get_end_user_object(
end_user_id: Optional[str],
prisma_client: Optional[PrismaClient],
@ -82,3 +197,75 @@ async def get_end_user_object(
return LiteLLM_EndUserTable(**response.dict())
except Exception as e: # if end-user not in db
return None
async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
"""
- Check if user id in proxy User Table
- if valid, return LiteLLM_UserTable object with defined limits
- if not, then raise an error
"""
if self.prisma_client is None:
raise Exception(
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
)
# check if in cache
cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
if cached_user_obj is not None:
if isinstance(cached_user_obj, dict):
return LiteLLM_UserTable(**cached_user_obj)
elif isinstance(cached_user_obj, LiteLLM_UserTable):
return cached_user_obj
# else, check db
try:
response = await self.prisma_client.db.litellm_usertable.find_unique(
where={"user_id": user_id}
)
if response is None:
raise Exception
return LiteLLM_UserTable(**response.dict())
except Exception as e:
raise Exception(
f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
)
async def get_team_object(
team_id: str,
prisma_client: Optional[PrismaClient],
user_api_key_cache: DualCache,
) -> LiteLLM_TeamTable:
"""
- Check if team id in proxy Team Table
- if valid, return LiteLLM_TeamTable object with defined limits
- if not, then raise an error
"""
if prisma_client is None:
raise Exception(
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
)
# check if in cache
cached_team_obj = user_api_key_cache.async_get_cache(key=team_id)
if cached_team_obj is not None:
if isinstance(cached_team_obj, dict):
return LiteLLM_TeamTable(**cached_team_obj)
elif isinstance(cached_team_obj, LiteLLM_TeamTable):
return cached_team_obj
# else, check db
try:
response = await prisma_client.db.litellm_teamtable.find_unique(
where={"team_id": team_id}
)
if response is None:
raise Exception
return LiteLLM_TeamTable(**response.dict())
except Exception as e:
raise Exception(
f"Team doesn't exist in db. Team={team_id}. Create team via `/team/new` call."
)

View file

@ -6,50 +6,17 @@ Currently only supports admin.
JWT token must have 'litellm_proxy_admin' in scope.
"""
import httpx
import jwt
from jwt.algorithms import RSAAlgorithm
import json
import os
from litellm.caching import DualCache
from litellm.proxy._types import LiteLLMProxyRoles, LiteLLM_UserTable
from litellm._logging import verbose_proxy_logger
from litellm.proxy._types import LiteLLM_JWTAuth, LiteLLM_UserTable
from litellm.proxy.utils import PrismaClient
from litellm.llms.custom_httpx.httpx_handler import HTTPHandler
from typing import Optional
class HTTPHandler:
def __init__(self, concurrent_limit=1000):
# Create a client with a connection pool
self.client = httpx.AsyncClient(
limits=httpx.Limits(
max_connections=concurrent_limit,
max_keepalive_connections=concurrent_limit,
)
)
async def close(self):
# Close the client when you're done with it
await self.client.aclose()
async def get(
self, url: str, params: Optional[dict] = None, headers: Optional[dict] = None
):
response = await self.client.get(url, params=params, headers=headers)
return response
async def post(
self,
url: str,
data: Optional[dict] = None,
params: Optional[dict] = None,
headers: Optional[dict] = None,
):
response = await self.client.post(
url, data=data, params=params, headers=headers
)
return response
class JWTHandler:
"""
- treat the sub id passed in as the user id
@ -67,113 +34,139 @@ class JWTHandler:
self.http_handler = HTTPHandler()
def update_environment(
self, prisma_client: Optional[PrismaClient], user_api_key_cache: DualCache
self,
prisma_client: Optional[PrismaClient],
user_api_key_cache: DualCache,
litellm_jwtauth: LiteLLM_JWTAuth,
) -> None:
self.prisma_client = prisma_client
self.user_api_key_cache = user_api_key_cache
self.litellm_jwtauth = litellm_jwtauth
def is_jwt(self, token: str):
parts = token.split(".")
return len(parts) == 3
def is_admin(self, scopes: list) -> bool:
if LiteLLMProxyRoles.PROXY_ADMIN.value in scopes:
if self.litellm_jwtauth.admin_jwt_scope in scopes:
return True
return False
def get_user_id(self, token: dict, default_value: str) -> str:
def is_team(self, scopes: list) -> bool:
if self.litellm_jwtauth.team_jwt_scope in scopes:
return True
return False
def get_end_user_id(self, token: dict, default_value: Optional[str]) -> str:
try:
user_id = token["sub"]
if self.litellm_jwtauth.end_user_id_jwt_field is not None:
user_id = token[self.litellm_jwtauth.end_user_id_jwt_field]
else:
user_id = None
except KeyError:
user_id = default_value
return user_id
def get_team_id(self, token: dict, default_value: Optional[str]) -> Optional[str]:
try:
team_id = token["azp"]
team_id = token[self.litellm_jwtauth.team_id_jwt_field]
except KeyError:
team_id = default_value
return team_id
async def get_user_object(self, user_id: str) -> LiteLLM_UserTable:
"""
- Check if user id in proxy User Table
- if valid, return LiteLLM_UserTable object with defined limits
- if not, then raise an error
"""
if self.prisma_client is None:
raise Exception(
"No DB Connected. See - https://docs.litellm.ai/docs/proxy/virtual_keys"
)
# check if in cache
cached_user_obj = self.user_api_key_cache.async_get_cache(key=user_id)
if cached_user_obj is not None:
if isinstance(cached_user_obj, dict):
return LiteLLM_UserTable(**cached_user_obj)
elif isinstance(cached_user_obj, LiteLLM_UserTable):
return cached_user_obj
# else, check db
try:
response = await self.prisma_client.db.litellm_usertable.find_unique(
where={"user_id": user_id}
)
if response is None:
raise Exception
return LiteLLM_UserTable(**response.dict())
except Exception as e:
raise Exception(
f"User doesn't exist in db. User={user_id}. Create user via `/user/new` call."
)
def get_scopes(self, token: dict) -> list:
try:
# Assuming the scopes are stored in 'scope' claim and are space-separated
scopes = token["scope"].split()
if isinstance(token["scope"], str):
# Assuming the scopes are stored in 'scope' claim and are space-separated
scopes = token["scope"].split()
elif isinstance(token["scope"], list):
scopes = token["scope"]
else:
raise Exception(
f"Unmapped scope type - {type(token['scope'])}. Supported types - list, str."
)
except KeyError:
scopes = []
return scopes
async def auth_jwt(self, token: str) -> dict:
async def get_public_key(self, kid: Optional[str]) -> dict:
keys_url = os.getenv("JWT_PUBLIC_KEY_URL")
if keys_url is None:
raise Exception("Missing JWT Public Key URL from environment.")
response = await self.http_handler.get(keys_url)
cached_keys = await self.user_api_key_cache.async_get_cache(
"litellm_jwt_auth_keys"
)
if cached_keys is None:
response = await self.http_handler.get(keys_url)
keys = response.json()["keys"]
keys = response.json()["keys"]
await self.user_api_key_cache.async_set_cache(
key="litellm_jwt_auth_keys",
value=keys,
ttl=self.litellm_jwtauth.public_key_ttl, # cache for 10 mins
)
else:
keys = cached_keys
public_key: Optional[dict] = None
if len(keys) == 1:
if kid is None or key["kid"] == kid:
public_key = keys[0]
elif len(keys) > 1:
for key in keys:
if kid is not None and key["kid"] == kid:
public_key = key
if public_key is None:
raise Exception(
f"No matching public key found. kid={kid}, keys_url={keys_url}, cached_keys={cached_keys}"
)
return public_key
async def auth_jwt(self, token: str) -> dict:
from jwt.algorithms import RSAAlgorithm
header = jwt.get_unverified_header(token)
kid = header["kid"]
for key in keys:
if key["kid"] == kid:
jwk = {
"kty": key["kty"],
"kid": key["kid"],
"n": key["n"],
"e": key["e"],
}
public_key = RSAAlgorithm.from_jwk(json.dumps(jwk))
verbose_proxy_logger.debug("header: %s", header)
try:
# decode the token using the public key
payload = jwt.decode(
token,
public_key, # type: ignore
algorithms=["RS256"],
audience="account",
)
return payload
kid = header.get("kid", None)
except jwt.ExpiredSignatureError:
# the token is expired, do something to refresh it
raise Exception("Token Expired")
except Exception as e:
raise Exception(f"Validation fails: {str(e)}")
public_key = await self.get_public_key(kid=kid)
if public_key is not None and isinstance(public_key, dict):
jwk = {}
if "kty" in public_key:
jwk["kty"] = public_key["kty"]
if "kid" in public_key:
jwk["kid"] = public_key["kid"]
if "n" in public_key:
jwk["n"] = public_key["n"]
if "e" in public_key:
jwk["e"] = public_key["e"]
public_key_rsa = RSAAlgorithm.from_jwk(json.dumps(jwk))
try:
# decode the token using the public key
payload = jwt.decode(
token,
public_key_rsa, # type: ignore
algorithms=["RS256"],
options={"verify_aud": False},
)
return payload
except jwt.ExpiredSignatureError:
# the token is expired, do something to refresh it
raise Exception("Token Expired")
except Exception as e:
raise Exception(f"Validation fails: {str(e)}")
raise Exception("Invalid JWT Submitted")

Some files were not shown because too many files have changed in this diff Show more